more mass renaming
This commit is contained in:
@@ -0,0 +1,802 @@
|
||||
% @doc
|
||||
% Ref: so_scan.erl
|
||||
%
|
||||
% This file contains a sophia tokenizer written in straightforward erlang with data
|
||||
% types that are sane.
|
||||
%
|
||||
% For the MVP it mimics the behavior of so_scan exactly, e.g. in what it
% considers to be a token.
|
||||
%
|
||||
% gsc_so_scan.erl contains a compatibility layer that should agree with so_scan
|
||||
% exactly. It converts the data types here to the shapes that so_scan outputs.
|
||||
%
|
||||
% This is for two reasons:
|
||||
%
|
||||
% 1. in order to enable testing the two modules against each other, and
|
||||
% 2. to future-proof in case we decide to incrementally incorporate the gsc
|
||||
% code into the legacy sophia compiler
|
||||
% @end
|
||||
-module(gs_tokens).
|
||||
|
||||
% meta
|
||||
-export([
|
||||
token_shapes_parse_order/0,
|
||||
kwds/0
|
||||
]).
|
||||
|
||||
-export([
|
||||
take_while/2,
|
||||
take_while/3,
|
||||
take_block/1,
|
||||
take_block_item/1,
|
||||
strings/2,
|
||||
slurp_plist/1
|
||||
]).
|
||||
|
||||
% token slurping
|
||||
-export([
|
||||
indent_level/1,
|
||||
is_significant/1,
|
||||
filter_significant/1,
|
||||
significant_tokens/1,
|
||||
tokens_from_iolist/1,
|
||||
tokens/1,
|
||||
slurp_token/2,
|
||||
slurp_token_shapes/3,
|
||||
slurp_token_of_shape/3,
|
||||
new_pos/2
|
||||
]).
|
||||
|
||||
-include("$gsc_include/gsc.hrl").
|
||||
|
||||
|
||||
%=======================================================
|
||||
% API: functions
|
||||
%=======================================================
|
||||
|
||||
-spec strings(N, Tokens) -> AtMostNStrings
    when N              :: non_neg_integer(),
         Tokens         :: [tk()],
         AtMostNStrings :: [string()].
% @doc
% Return the `str' fields of the first N tokens, in order. When fewer than
% N tokens are available, the strings of all of them are returned.

strings(0, _Tokens) ->
    [];
strings(_N, []) ->
    [];
strings(N, [#tk{str = Str} | More]) when is_integer(N), N >= 1 ->
    [Str | strings(N - 1, More)].
|
||||
|
||||
|
||||
% used by parser
|
||||
%
|
||||
% a block is a column-delimited list of block items
|
||||
%
|
||||
% BLOCK =
|
||||
% foo
|
||||
% ...
|
||||
% bar
|
||||
% ...
|
||||
% baz
|
||||
% ...
|
||||
%
|
||||
% BLOCK_ITEM =
|
||||
% foo
|
||||
% ...
|
||||
|
||||
-spec take_block(Tokens) -> {BlockTokens, Rest}
    when Tokens      :: [tk()],
         BlockTokens :: Tokens,
         Rest        :: Tokens.
% @doc
% Split a block off the front of the token list. The block is the head
% token followed by the longest run of tokens whose column position is >=
% the column position of the head token; scanning stops at the first token
% that sits to the left of the head. `Rest' is everything from that token
% on. An empty token list yields `{[], []}'.

take_block([]) ->
    {[], []};
take_block([First = #tk{pos = {_Line, BlockCol}} | Others]) ->
    % the head token is always part of its own block, so it seeds the prefix
    InBlock = fun(#tk{pos = {_, Col}}) -> Col >= BlockCol end,
    take_while(InBlock, [First], Others).
|
||||
|
||||
|
||||
|
||||
-spec take_block_item(Tokens) -> {ItemTokens, Rest}
    when Tokens     :: [tk()],
         ItemTokens :: Tokens,
         Rest       :: Tokens.
% @doc
% Split a single block item off the front of the token list. The item is
% the head token followed by the longest run of tokens whose column
% position is strictly > the column position of the head token; scanning
% stops at the first token at or to the left of the head's column. `Rest'
% is everything from that token on. An empty token list yields `{[], []}'.

take_block_item([]) ->
    {[], []};
take_block_item([First = #tk{pos = {_Line, ItemCol}} | Others]) ->
    % the head token is always part of its own item, so it seeds the prefix
    InItem = fun(#tk{pos = {_, Col}}) -> Col > ItemCol end,
    take_while(InItem, [First], Others).
|
||||
|
||||
|
||||
|
||||
-spec slurp_plist(Tokens) -> Result
    when Tokens    :: [Token],
         Result    :: {slurp, PList :: Tokens, After :: Tokens}
                    | {error, Mismatch},
         Mismatch  :: {fixme, mismatch, OpenStack, ClosedBy},
         OpenStack :: Tokens,
         ClosedBy  :: none | {value, Token},
         Token     :: tk().

% @doc
% Slurp a parenthesized list off the front of the token list.
%
% The verb is `slurp' rather than `take' because delimiter matching is
% enforced: every "(", "[" and "{" opened inside the list must be closed by
% its own counterpart before the list ends.
%
% If the first token is not "(", nothing is consumed and the result is
% `{slurp, [], Tokens}'.
%
% Examples (token lists are written as the text they came from)
%
% typical happy path:
%   "(foo, bar) baz"     ~> {slurp, "(foo, bar)", "baz"}
%   "() baz"             ~> {slurp, "()", "baz"}
%   "foo () baz"         ~> {slurp, "", "foo () baz"}
%
% typical sad path:
%   "(foo, bar]"         ~> {error, {fixme, mismatch, ["("], {value, "]"}}}
%   "(foo, bar"          ~> {error, {fixme, mismatch, ["("], none}}
%   "([foo, bar)"        ~> {error, {fixme, mismatch, ["[", "("], {value, ")"}}}
%
% counterintuitive:
%   "[foo, bar) baz"     ~> {slurp, "", "[foo, bar) baz"}
%   "~!!\inv4l1d syntax" ~> {slurp, "", "~!!\inv4l1d syntax"}
%   "(foo, bar)(baz)"    ~> {slurp, "(foo, bar)", "(baz)"}
%
% The only "syntax checking" performed is making sure the delimiter stack
% pushes and pops properly.
%
% Note that on mismatch the open delimiters are returned in STACK order,
% i.e. the most recently opened delimiter comes first. That is convenient
% for programs but may be counterintuitive when shown to end users.

slurp_plist([Open = #tk{str = "("} | After]) ->
    % the opening paren seeds both the accumulator of consumed tokens and
    % the stack of currently open delimiters
    slurp_dlist([Open], [Open], After);
slurp_plist(Tokens) ->
    {slurp, [], Tokens}.
|
||||
|
||||
|
||||
% slurp_dlist(All, Opens, Tokens)
%
% Worker loop behind slurp_plist/1.
%
%   All    - tokens consumed so far, most recent first
%   Opens  - stack of open delimiter tokens that still need closing, most
%            recent first
%   Tokens - tokens not yet looked at
%
% Returns `{slurp, Consumed, Rest}' as soon as the open stack is empty, or
% `{error, {fixme, mismatch, Opens, ClosedBy}}' when the input runs out
% (`ClosedBy = none') or a closing delimiter does not match the top of the
% stack (`ClosedBy = {value, BadClose}').
%
% Clause order matters: matching closes are tried before pushes, and the
% mismatch clauses before the catch-all that pushes ordinary tokens.

% happy terminal case: stack popped entirely
slurp_dlist(All, [], Rest) ->
    {slurp, lists:reverse(All), Rest};
% from here on the stack is nonempty
% happy: a close delimiter pops its matching open
slurp_dlist(All, [#tk{str = "("} | Opens], [#tk{str = ")"} = Tk | Tks]) ->
    slurp_dlist([Tk | All], Opens, Tks);
slurp_dlist(All, [#tk{str = "["} | Opens], [#tk{str = "]"} = Tk | Tks]) ->
    slurp_dlist([Tk | All], Opens, Tks);
slurp_dlist(All, [#tk{str = "{"} | Opens], [#tk{str = "}"} = Tk | Tks]) ->
    slurp_dlist([Tk | All], Opens, Tks);
% happy: open delimiters get pushed
slurp_dlist(All, Opens, [#tk{str = Str} = Tk | Tks])
        when Str =:= "("; Str =:= "["; Str =:= "{" ->
    slurp_dlist([Tk | All], [Tk | Opens], Tks);
% sad: input ended while delimiters were still open
slurp_dlist(_All, Opens, []) ->
    {error, {fixme, mismatch, Opens, none}};
% sad: a close delimiter that does not match the top of the stack (the
% matching cases were already handled above)
slurp_dlist(_All, Opens, [#tk{str = Str} = BadClose | _])
        when Str =:= "}"; Str =:= "]"; Str =:= ")" ->
    {error, {fixme, mismatch, Opens, {value, BadClose}}};
% general case: any other token is consumed without touching the stack
slurp_dlist(All, Opens, [Tk | Tks]) ->
    slurp_dlist([Tk | All], Opens, Tks).
|
||||
|
||||
|
||||
%-------------------------------------------------------
|
||||
% API: meta info
|
||||
%
|
||||
% This is parse order definition, list of keywords, etc
|
||||
%
|
||||
% -export([
|
||||
% token_shapes_parse_order/0,
|
||||
% kwds/0
|
||||
% ]).
|
||||
%-------------------------------------------------------
|
||||
|
||||
-spec token_shapes_parse_order() -> [tk_shape()].
% @doc
% List of sophia token shapes in parse order (if an earlier shape matches,
% the later shapes are not even checked).
%
% For comparison, the rule list from so_scan:
%
%    Rules =
%          %% Comments and whitespace
%        [ CommentStart
%        , {"//.*", skip()}
%        , {WS,     skip()}
%
%          %% Special characters
%        , {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
%
%          %% Literals
%        , {CHAR,   token(char,   fun parse_char/1)}
%        , {STRING, token(string, fun parse_string/1)}
%        , {HEX,    token(hex,    fun parse_hex/1)}
%        , {INT,    token(int,    fun parse_int/1)}
%        , {BYTES,  token(bytes,  fun parse_bytes/1)}
%
%          %% Identifiers (qualified first!)
%        , {QID,  token(qid,  fun(S) -> string:tokens(S, ".") end)}
%        , {QCON, token(qcon, fun(S) -> string:tokens(S, ".") end)}
%        , {TVAR, token(tvar)}
%        , override({ID, token(id)}, {KW, symbol()})    %% Keywords override identifiers. Need to
%        , {CON, token(con)}                            %% use override to avoid lexing "lettuce"
%                                                       %% as ['let', {id, "tuce"}].
%          %% Operators
%        , {OP, symbol()}
%        ],
% @end

token_shapes_parse_order() ->
    % one named group per category, so that groups are easy to edit
    CommentsAndWhitespace = [lcom, bcom, ws],
    Punctuation           = [punct],
    Literals              = [char, string, int16, int10, bytes,
                             ak, ct, sg],
    % qualified names need to go ahead of unqualifieds
    QualifiedNames        = [qid, qcon],
    TypeVariables         = [tvar],
    % keywords need to be parsed ahead of ids
    KeywordsThenIds       = [kwd, id],
    Constructors          = [con],
    % ops [=, =>, >>]
    Operators             = [op],
    lists:append([CommentsAndWhitespace,
                  Punctuation,
                  Literals,
                  QualifiedNames,
                  TypeVariables,
                  KeywordsThenIds,
                  Constructors,
                  Operators]).
|
||||
|
||||
|
||||
-spec kwds() -> list(string()).
% @doc
% The sophia keywords, as strings. Every keyword is also a syntactically
% valid identifier; the tokenizer uses this list to tell the two apart.

kwds() ->
    [
        "contract", "include", "let", "switch", "type", "record",
        "datatype", "if", "elif", "else", "function", "stateful",
        "payable", "true", "false", "mod", "public", "entrypoint",
        "private", "indexed", "namespace", "interface", "main", "using",
        "as", "for", "hiding", "band", "bor", "bxor", "bnot"
    ].
|
||||
|
||||
|
||||
%-------------------------------------------------------
|
||||
% API: token slurping
|
||||
%
|
||||
% -export([
%     indent_level/1,
%     is_significant/1,
%     filter_significant/1,
%     significant_tokens/1,
%     tokens_from_iolist/1,
%     tokens/1,
%     slurp_token/2,
%     slurp_token_shapes/3,
%     slurp_token_of_shape/3,
%     new_pos/2
% ]).
|
||||
%-------------------------------------------------------
|
||||
|
||||
% Token accessors
|
||||
-spec indent_level(tk()) -> pos_integer().
% @doc
% The indent level of a token, which is simply the column component of its
% `{Line, Column}' position.

indent_level(#tk{pos = {_Line, Column}}) ->
    Column.
|
||||
|
||||
|
||||
|
||||
-spec significant_tokens(SrcStr) -> Result
    when SrcStr :: iolist(),
         Result :: {ok, Tokens}
                 | {error, gsc_err()},
         Tokens :: [tk()].
% @doc
% Tokenize the source with tokens/1 and drop the insignificant tokens
% (comments and whitespace, see is_significant/1). Anything other than
% `{ok, Tokens}' coming back from tokens/1 is returned unchanged.

significant_tokens(SrcStr) ->
    Outcome = tokens(SrcStr),
    case Outcome of
        {ok, AllTokens} -> {ok, filter_significant(AllTokens)};
        _NotOk          -> Outcome
    end.
|
||||
|
||||
|
||||
|
||||
-spec filter_significant(Tokens) -> SignificantTokens
    when Tokens            :: [tk()],
         SignificantTokens :: Tokens.
% @doc
% Keep only the tokens for which is_significant/1 holds, preserving order.

filter_significant(Tokens) ->
    [Tk || Tk <- Tokens, is_significant(Tk)].
|
||||
|
||||
|
||||
|
||||
-spec is_significant(Token) -> boolean()
    when Token :: tk().
% @doc
% A token is insignificant when it is a block comment (`bcom'), a line
% comment (`lcom') or whitespace (`ws'); everything else is significant.

is_significant(#tk{shape = Shape})
        when Shape =:= bcom; Shape =:= lcom; Shape =:= ws ->
    false;
is_significant(_Other) ->
    true.
|
||||
|
||||
|
||||
-spec tokens_from_iolist(SrcStr) -> Result
    when SrcStr :: iolist(),
         Result :: {ok, Tokens}
                 | {error, gsc_err()},
         Tokens :: [tk()].
% @doc
% Alias for tokens/1; the name spells out the accepted input type.

tokens_from_iolist(SrcStr) ->
    tokens(SrcStr).
|
||||
|
||||
|
||||
|
||||
-spec tokens(SrcStr) -> Result
    when SrcStr :: iolist(),
         Result :: {ok, Tokens}
                 | {error, gsc_err()},
         Tokens :: [tk()].
% @doc
% Tokenize an entire source text.
%
% Tokens are slurped off the front of the text one after another, starting
% at position `{1, 1}', until the text is exhausted; the result is then
% `{ok, Tokens}' with the tokens in source order (comments and whitespace
% included).
%
% If at some point no token shape matches the remaining text (semantically:
% an illegal character), the result is `{error, #gsc_err_no_tokmatch{}}'.
% If a block comment is still open at the end of the text, the result is
% `{error, #gsc_err_bcom_unterminated{}}'. Both records carry the tokens
% read so far, the position where tokenizing stopped, and the unconsumed
% remainder of the text.

tokens(SrcStr) ->
    % defensive normalization: from here on the text is a flat list of
    % codepoints in NFC form
    % NOTE(review): unicode:characters_to_nfc_list/1 returns an error tuple
    % for invalid input, which is passed on unchecked - confirm callers
    % only hand in valid chardata.
    Chars = unicode:characters_to_nfc_list(SrcStr),
    tokens([], {1, 1}, Chars).

% tokens(Acc, Pos, Chars)
%   Acc   - tokens read so far, most recent first
%   Pos   - position of the first character of Chars
%   Chars - text not yet tokenized
tokens(Acc, _EndPos, "") ->
    {ok, lists:reverse(Acc)};
tokens(Acc, Pos, Chars) ->
    case slurp_token(Pos, Chars) of
        {error, _} = Failure ->
            Failure;
        % FIXME so_scan bad
        % so_scan allows unterminated block comments at the end of files.
        % The original note here says "for now we're just going to agree
        % with so_scan"; this function itself reports the condition as an
        % error.
        % NOTE(review): presumably the so_scan-compatible behavior is
        % produced by a caller handling this error - confirm.
        {ierr, unterminated_block_comment} ->
            {error, #gsc_err_bcom_unterminated{prev_tokens = lists:reverse(Acc),
                                               break_pos   = Pos,
                                               rest        = Chars}};
        no_tokmatch ->
            {error, #gsc_err_no_tokmatch{prev_tokens = lists:reverse(Acc),
                                         break_pos   = Pos,
                                         rest        = Chars}};
        {tokmatch, #tk{str = TokStr} = Tk, Remaining} ->
            % the next token starts where this token's text ends
            tokens([Tk | Acc], new_pos(Pos, TokStr), Remaining)
    end.
|
||||
|
||||
% alright some bullshit here
|
||||
%
|
||||
% we're computing the line/column position of each string
|
||||
%
|
||||
% however this is meant to be compatible with so_scan, so it's a bit wonky
|
||||
% because regex list bullshit.
|
||||
%
|
||||
% recall that so_scan operates on the list representation of the utf-8 encoded
|
||||
% bytes; this is different than on a list of bignum codepoints (e.g.
|
||||
% unicode:characters_to_nfc_list(Bytes)); let's suppose some stupid complicated
|
||||
% foreign character which a sane language would simply criminalize has list
|
||||
% representation [ABC], but byte representation <<A,B,C>>
|
||||
%
|
||||
% as far as so_scan is concerned, this means the character ABC consumes 3
|
||||
% columns. the only exception is tab characters, which always fast-forward to
|
||||
% the next tab stop, which is 1-indexed because god hates all of us
|
||||
%
|
||||
% so the tab-stops are
|
||||
% 1 9 17 25 33 ...
|
||||
%
|
||||
% column position is determined in all cases by byte order, EXCEPT for $\t
|
||||
% which goes to the next tab stop
|
||||
%
|
||||
% so in general, for the token string, we need to convert to bytes first,
|
||||
% then handle `\t` bytes as a special case
|
||||
%
|
||||
% again in the tokenizer context, we're assuming that the input to our
|
||||
% tokenizer is an nfc-list which has a flat list of each unicode character in
|
||||
% codepoint form
|
||||
%
|
||||
% here we're just converting it to byte form, then computing columns based on
|
||||
% bytes
|
||||
% Position reached after consuming TokStr when starting at OldPos. Columns
% are counted on the UTF-8 encoded bytes of the token text, so the text is
% encoded first and the bytes are walked by new_pos_bytes/2.
new_pos(OldPos, TokStr) ->
    Utf8Bytes = unicode:characters_to_binary(TokStr),
    new_pos_bytes(Utf8Bytes, OldPos).
|
||||
|
||||
% new_pos_bytes(Bytes, {Line, Col})
%
% Walk over the bytes, advancing a 1-based {Line, Col} position:
%   - end of input: the current position is the answer
%   - newline:      go to column 1 of the next line
%   - tab:          jump to the next tab stop
%   - anything else advances the column by 1 (per byte, not per character)
new_pos_bytes(<<>>, Pos) ->
    Pos;
new_pos_bytes(<<$\n:8, Tail/bytes>>, {Line, _Col}) ->
    new_pos_bytes(Tail, {Line + 1, 1});
new_pos_bytes(<<$\t:8, Tail/bytes>>, {Line, Col1}) ->
    % positions are 1-based, so the tab stops are at 1 9 17 25 ...
    % next_tabstop8/1 works on 0-based columns, where the stops are at
    % 0 8 16 24 ...; so shift down by one, jump, and shift back up
    new_pos_bytes(Tail, {Line, next_tabstop8(Col1 - 1) + 1});
new_pos_bytes(<<_:8, Tail/bytes>>, {Line, Col1}) ->
    new_pos_bytes(Tail, {Line, Col1 + 1}).
|
||||
|
||||
% Next tab stop strictly after a 0-based column, with tab stops every 8
% columns:
%
%   stops:  0    8    16   24   ...
%           0*8  1*8  2*8  3*8  ...
%
% Col0 lies in the stop interval number (Col0 div 8); the next stop is the
% start of the following interval.
next_tabstop8(Col0) when Col0 >= 0 ->
    (Col0 div 8 + 1) * 8.
|
||||
|
||||
%% copied from so_scan_lib.erl just to match behavior
|
||||
%-define(TAB_SIZE, 8).
|
||||
%
|
||||
%next_pos([], P) -> P;
|
||||
%next_pos([$\n | S], {L, _}) -> next_pos(S, {L + 1, 1});
|
||||
%next_pos([$\t | S], {L, C}) -> next_pos(S, {L, (C + ?TAB_SIZE - 1) div ?TAB_SIZE * ?TAB_SIZE + 1});
|
||||
%next_pos([_ | S], {L, C}) -> next_pos(S, {L, C + 1}).
|
||||
|
||||
|
||||
|
||||
-spec slurp_token(Pos, SrcStr) -> Result
    when Pos    :: tk_pos(),
         SrcStr :: string(),
         Result :: {tokmatch, Token, Rest}
                 | no_tokmatch
                 | {error, gsc_err()}
                 | {ierr, unterminated_block_comment},
         Token  :: tk(),
         Rest   :: string().
% @doc
% Grab a single token off the front of the string, trying the token shapes
% in the order given by `token_shapes_parse_order/0'. `Pos' is recorded as
% the position of the resulting token.

slurp_token(Pos, SrcStr) ->
    % kept as a separate binding so the order is easy to swap out
    ParseOrder = token_shapes_parse_order(),
    slurp_token_shapes(ParseOrder, Pos, SrcStr).
|
||||
|
||||
|
||||
|
||||
-spec slurp_token_shapes(ParseOrder, Pos, SrcStr) -> Result
    when ParseOrder :: [tk_shape()],
         Pos        :: tk_pos(),
         SrcStr     :: string(),
         Result     :: {tokmatch, Token, Rest}
                     | no_tokmatch
                     | {error, gsc_err()}
                     | {ierr, unterminated_block_comment},
         Token      :: tk(),
         Rest       :: string().
% @doc
% Grab a single token off the front of the string, trying the shapes in
% `ParseOrder' from left to right. The first shape that matches wins; an
% `{ierr, _}' or `{error, _}' result from any shape stops the search and is
% returned as is. If no shape matches, the result is `no_tokmatch'.

slurp_token_shapes([], _Pos, _SrcStr) ->
    no_tokmatch;
slurp_token_shapes([Shape | OtherShapes], Pos, SrcStr) ->
    case slurp_token_of_shape(Shape, Pos, SrcStr) of
        no_tokmatch ->
            slurp_token_shapes(OtherShapes, Pos, SrcStr);
        {tokmatch, _, _} = Hit ->
            Hit;
        {Tag, _} = Failure when Tag =:= ierr; Tag =:= error ->
            Failure
    end.
|
||||
|
||||
|
||||
-spec slurp_token_of_shape(TokenType, Pos, SrcStr) -> MaybeToken
    when TokenType  :: tk_shape(),
         Pos        :: tk_pos(),
         SrcStr     :: string(),
         MaybeToken :: {tokmatch, Token, Rest}
                     | no_tokmatch
                     | {error, gsc_err()}
                     | {ierr, unterminated_block_comment},
         Token      :: tk(),
         Rest       :: string().
% @doc
% Match a sophia token of a given shape off the front of the string.
%
% On success the result is `{tokmatch, Token, Rest}', where `Token' has the
% requested shape, position `Pos' and the matched text as its `str', and
% `Rest' is the text after the token. If the front of the string does not
% have that shape the result is `no_tokmatch'. For `bcom', a comment that
% is still open at the end of the text gives
% `{ierr, unterminated_block_comment}'.
%
% Asking for a shape that has no clause here raises an error carrying a
% `#gsc_err{atom = nyi}'.
% @end

% COMMENTS AND WHITESPACE: lcom, bcom, ws
%
% lcom: sophia line comment; "//" up to, but not including, the next
% newline (or the end of the text). Matched by hand rather than through a
% gs_strmatch string matcher.
% FIXME: make a string matcher for line comments
slurp_token_of_shape(lcom, Pos, SrcStr = "//" ++ _) ->
    {Line, Rest} = takeline("", SrcStr),
    {tokmatch, #tk{shape = lcom, pos = Pos, str = Line}, Rest};
slurp_token_of_shape(lcom, _Pos, _SrcStr) ->
    no_tokmatch;
% bcom: block comments cannot have a string matcher because they need a
% depth counter to keep track of nested block comments
slurp_token_of_shape(bcom, Pos, "/*" ++ AfterOpen) ->
    case bcom("/*", 1, AfterOpen) of
        {ok, CommentStr, Rest} ->
            {tokmatch, #tk{shape = bcom, pos = Pos, str = CommentStr}, Rest};
        Error ->
            Error
    end;
slurp_token_of_shape(bcom, _Pos, _SrcStr) ->
    no_tokmatch;
slurp_token_of_shape(ws, Pos, SrcStr) ->
    slurp_strmatch(ws, gs_strmatch:smr_sf_ws(), Pos, SrcStr);
% KEYWORDS, OPERATORS, PUNCTUATION: kwd, op, punct
%
% kwds are allowed to be prefixes of user-defined variable names; e.g.
% "lettuce" should be parsed as an id, not as ["let", "tuce"]; for this
% reason we need to be careful with greedily parsing kwds
%
% we know kwds are always valid ids, so we parse an id and then check
% whether it is one of the kwds
slurp_token_of_shape(kwd, Pos, SrcStr) ->
    case slurp_token_of_shape(id, Pos, SrcStr) of
        {tokmatch, IdTok = #tk{str = IdStr}, Rest} ->
            case lists:member(IdStr, kwds()) of
                true  -> {tokmatch, IdTok#tk{shape = kwd}, Rest};
                false -> no_tokmatch
            end;
        no_tokmatch ->
            no_tokmatch
    end;
slurp_token_of_shape(op, Pos, SrcStr) ->
    slurp_strmatch(op, gs_strmatch:smr_sf_op(), Pos, SrcStr);
slurp_token_of_shape(punct, Pos, SrcStr) ->
    slurp_strmatch(punct, gs_strmatch:smr_sf_punct(), Pos, SrcStr);
% SOPHIA VARIABLE NAMES: id, con, qid, qcon, tvar
slurp_token_of_shape(id, Pos, SrcStr) ->
    slurp_strmatch(id, gs_strmatch:smr_sf_id(), Pos, SrcStr);
slurp_token_of_shape(con, Pos, SrcStr) ->
    slurp_strmatch(con, gs_strmatch:smr_sf_con(), Pos, SrcStr);
slurp_token_of_shape(qid, Pos, SrcStr) ->
    slurp_strmatch(qid, gs_strmatch:smr_sf_qid(), Pos, SrcStr);
slurp_token_of_shape(qcon, Pos, SrcStr) ->
    slurp_strmatch(qcon, gs_strmatch:smr_sf_qcon(), Pos, SrcStr);
slurp_token_of_shape(tvar, Pos, SrcStr) ->
    slurp_strmatch(tvar, gs_strmatch:smr_sf_tvar(), Pos, SrcStr);
% LITERALS: int16, int10, ak, ct, sg, char, string, bytes
slurp_token_of_shape(int16, Pos, SrcStr) ->
    slurp_strmatch(int16, gs_strmatch:smr_sf_int16(), Pos, SrcStr);
slurp_token_of_shape(int10, Pos, SrcStr) ->
    slurp_strmatch(int10, gs_strmatch:smr_sf_int10(), Pos, SrcStr);
slurp_token_of_shape(ak, Pos, SrcStr) ->
    slurp_strmatch(ak, gs_strmatch:smr_sf_ak(), Pos, SrcStr);
slurp_token_of_shape(ct, Pos, SrcStr) ->
    slurp_strmatch(ct, gs_strmatch:smr_sf_ct(), Pos, SrcStr);
slurp_token_of_shape(sg, Pos, SrcStr) ->
    slurp_strmatch(sg, gs_strmatch:smr_sf_sg(), Pos, SrcStr);
% char: sophia char literal
slurp_token_of_shape(char, Pos, SrcStr) ->
    slurp_strmatch(char, gs_strmatch:smr_sf_char(), Pos, SrcStr);
slurp_token_of_shape(string, Pos, SrcStr) ->
    slurp_strmatch(string, gs_strmatch:smr_sf_str(), Pos, SrcStr);
slurp_token_of_shape(bytes, Pos, SrcStr) ->
    slurp_strmatch(bytes, gs_strmatch:smr_sf_bytes(), Pos, SrcStr);
% any other shape is not (yet) implemented
slurp_token_of_shape(NyiType, Pos, SrcStr) ->
    Message = io_lib:format("cannot slurp token of shape: ~p", [NyiType]),
    error(#gsc_err{atom  = nyi,
                   str   = Message,
                   extra = [{token_shape, NyiType},
                            {pos, Pos},
                            {rest, SrcStr}]}).

% Shared tail of all shapes that are recognized by a gs_strmatch string
% matcher: run the matcher on the front of the string and, on a match, wrap
% the matched text in a token of the given shape at the given position.
slurp_strmatch(Shape, Matcher, Pos, SrcStr) ->
    case gs_strmatch:match(Matcher, SrcStr) of
        {strmatch, TokStr, Rest} ->
            {tokmatch, #tk{shape = Shape, pos = Pos, str = TokStr}, Rest};
        no_strmatch ->
            no_tokmatch
    end.
|
||||
|
||||
|
||||
|
||||
% takeline(Seen, Chars) -> {Line, Rest}
%
% Move characters from the front of Chars onto Seen (a reversed
% accumulator) until a newline or the end of the input is reached. The
% newline itself is not consumed: it stays at the front of Rest.
takeline(Seen, [Ch | More]) when Ch =/= $\n ->
    takeline([Ch | Seen], More);
takeline(Seen, [$\n | _] = Remaining) ->
    {lists:reverse(Seen), Remaining};
takeline(Seen, []) ->
    {lists:reverse(Seen), []}.
|
||||
|
||||
|
||||
% bcom(Comment, Depth, Remaining)
%
% Scan the inside of a (possibly nested) block comment.
%
%   Comment   - comment text collected so far, as nested chardata
%   Depth     - number of "/*" openers that have not been closed yet
%   Remaining - text not yet scanned
%
% Returns `{ok, CommentStr, Rest}' once the depth drops to 0, with the
% whole comment as a flat NFC list, or `{ierr, unterminated_block_comment}'
% if the text ends while a comment is still open.

% every opener has been closed: done
bcom(Comment, 0, Remaining) ->
    {ok, unicode:characters_to_nfc_list(Comment), Remaining};
% premature end
bcom(_Comment, Depth, "") when Depth > 0 ->
    {ierr, unterminated_block_comment};
% closer: decrease depth
bcom(Comment, Depth, "*/" ++ Remaining) when Depth > 0 ->
    bcom([Comment, "*/"], Depth - 1, Remaining);
% nested opener: increase depth
bcom(Comment, Depth, "/*" ++ Remaining) when Depth > 0 ->
    bcom([Comment, "/*"], Depth + 1, Remaining);
% ordinary character: same depth, add to the comment text
bcom(Comment, Depth, [Char | Remaining]) when Depth > 0 ->
    bcom([Comment, Char], Depth, Remaining).
|
||||
|
||||
|
||||
%------------------------------------------
|
||||
% INTERNAL UTILITIES
|
||||
%------------------------------------------
|
||||
|
||||
-spec take_while(Pred, List) -> {Taken, Rest}
    when Pred  :: fun((Item) -> boolean()),
         List  :: [Item],
         Taken :: List,
         Rest  :: List.
% @doc
% Similar to lists:takewhile/2, but returns both halves: `Taken' is the
% longest leading run of items satisfying `Pred', `Rest' is everything
% after it. The name is spelled differently to remind you that it returns
% two things.

take_while(Pred, List) ->
    % no prefix, so start the worker with an empty stack
    tw3(Pred, [], List).
|
||||
|
||||
|
||||
-spec take_while(Pred, Prefix, List) -> {Taken, Rest}
    when Pred   :: fun((Item) -> boolean()),
         Prefix :: List,
         List   :: [Item],
         Taken  :: List,
         Rest   :: List.
% @doc
% Similar to take_while/2, but returns `{Prefix ++ Taken, Rest}', where
% `Taken' is the longest leading run of `List' satisfying `Pred'. The items
% of `Prefix' are not tested against `Pred'.

take_while(Pred, Pfx, List) ->
    % the worker accumulates on a stack (most recent first), so the prefix
    % has to be handed over reversed
    Stack = lists:reverse(Pfx),
    tw3(Pred, Stack, List).
|
||||
|
||||
|
||||
% tw3(Pred, Stack, Items) -> {Taken, Rest}
%
% Worker for take_while/2,3. Stack holds the items taken so far, most
% recent first. Items are moved onto it for as long as Pred holds; at the
% first item that fails Pred (or at the end of the list) the stack is
% reversed back into list order and returned with the untouched remainder.
tw3(_Pred, Stack, []) ->
    {lists:reverse(Stack), []};
tw3(Pred, Stack, [Item | Tail] = Remaining) ->
    case Pred(Item) of
        true  -> tw3(Pred, [Item | Stack], Tail);
        false -> {lists:reverse(Stack), Remaining}
    end.
|
||||
Reference in New Issue
Block a user