% @doc
% Ref: so_scan.erl
%
% This file contains a Sophia tokenizer written in straightforward Erlang,
% with data types that are sane.
%
% For the MVP it mimics the behavior of so_scan exactly, in terms of what its
% definition of a token is and so on.
%
% gsc_so_scan.erl contains a compatibility layer that should agree with so_scan
% exactly. It converts the data types here to the shapes that so_scan outputs.
%
% This is for two reasons:
%
% 1. in order to enable testing the two modules against each other, and
% 2. to future-proof in case we decide to incrementally incorporate the gsc
%    code into the legacy Sophia compiler
% @end
|
|
-module(gs_tokens).
|
|
|
|
% meta
|
|
-export([
|
|
token_shapes_parse_order/0,
|
|
kwds/0
|
|
]).
|
|
|
|
-export([
|
|
take_while/2,
|
|
take_while/3,
|
|
take_block/1,
|
|
take_block_item/1,
|
|
strings/2,
|
|
slurp_plist/1
|
|
]).
|
|
|
|
% token slurping
|
|
-export([
|
|
indent_level/1,
|
|
is_significant/1,
|
|
filter_significant/1,
|
|
significant_tokens/1,
|
|
tokens_from_iolist/1,
|
|
tokens/1,
|
|
slurp_token/2,
|
|
slurp_token_shapes/3,
|
|
slurp_token_of_shape/3,
|
|
new_pos/2
|
|
]).
|
|
|
|
-include("$gsc_include/gsc.hrl").
|
|
|
|
|
|
%=======================================================
|
|
% API: functions
|
|
%=======================================================
|
|
|
|
-spec strings(N, Tokens) -> AtMostNStrings
    when N              :: non_neg_integer(),
         Tokens         :: [tk()],
         AtMostNStrings :: [string()].
% @doc
% Return the `str' fields of the first N tokens, in order. When the token
% list holds fewer than N tokens, the strings of all of them are returned.

strings(0, _Tokens) ->
    % asked for nothing more: stop even if tokens remain
    [];
strings(_N, []) ->
    % ran out of tokens before the count was used up
    [];
strings(N, [#tk{str = Str} | Tail]) when is_integer(N), N >= 1 ->
    [Str | strings(N - 1, Tail)].
|
|
|
|
|
|
% used by parser
|
|
%
|
|
% a block is a column-delimited list of block items
|
|
%
|
|
% BLOCK =
|
|
% foo
|
|
% ...
|
|
% bar
|
|
% ...
|
|
% baz
|
|
% ...
|
|
%
|
|
% BLOCK_ITEM =
|
|
% foo
|
|
% ...
|
|
|
|
-spec take_block(Tokens) -> {BlockTokens, Rest}
    when Tokens      :: [tk()],
         BlockTokens :: Tokens,
         Rest        :: Tokens.
% @doc
% Split a block off the front of the token list.
%
% The block is the head token plus the longest run of following tokens whose
% column is >= the head token's column. `Rest' begins at the first token that
% sits to the left of that column (or is empty when there is no such token).
% An empty token list yields `{[], []}'.

take_block([]) ->
    {[], []};
take_block([First = #tk{pos = {_Line, FirstCol}} | Others]) ->
    StillInBlock = fun(#tk{pos = {_, Col}}) -> Col >= FirstCol end,
    % the head token is always part of its own block, so it is passed in
    % as the already-taken prefix
    take_while(StillInBlock, [First], Others).
|
|
|
|
|
|
|
|
-spec take_block_item(Tokens) -> {ItemTokens, Rest}
    when Tokens     :: [tk()],
         ItemTokens :: Tokens,
         Rest       :: Tokens.
% @doc
% Split a single block item off the front of the token list.
%
% The item is the head token plus the longest run of following tokens whose
% column is strictly greater than the head token's column. `Rest' begins at
% the first token at or to the left of that column (or is empty). An empty
% token list yields `{[], []}'.

take_block_item([]) ->
    {[], []};
take_block_item([First = #tk{pos = {_Line, FirstCol}} | Others]) ->
    StillInItem = fun(#tk{pos = {_, Col}}) -> Col > FirstCol end,
    % the head token always belongs to the item it opens
    take_while(StillInItem, [First], Others).
|
|
|
|
|
|
|
|
-spec slurp_plist(Tokens) -> Result
    when Tokens    :: [Token],
         Result    :: {slurp, PList :: Tokens, After :: Tokens}
                    | {error, Mismatch},
         Mismatch  :: {fixme, mismatch, OpenStack, ClosedBy},
         OpenStack :: Tokens,
         ClosedBy  :: none | {value, Token},
         Token     :: tk().

% @doc
% Slurp a parenthesized token list off the front of `Tokens'.
%
% The verb is `slurp' rather than `take' because delimiter matching is
% enforced: once an opening "(" has been seen, every "(", "[" and "{" must be
% closed by its own partner.
%
% (The examples below write token lists as strings for brevity.)
%
% typical happy path:
%   "(foo, bar) baz"      ~> {slurp, "(foo, bar)", "baz"}
%   "() baz"              ~> {slurp, "()", "baz"}
%   "foo () baz"          ~> {slurp, "", "foo () baz"}
%
% typical sad path:
%   "(foo, bar]"          ~> {error, {fixme, mismatch, ["("], {value, "]"}}}
%   "(foo, bar"           ~> {error, {fixme, mismatch, ["("], none}}
%   "([foo, bar)"         ~> {error, {fixme, mismatch, ["[", "("], {value, ")"}}}
%
% counterintuitive:
%   "[foo, bar) baz"      ~> {slurp, "", "[foo, bar) baz"}
%   "~!!\inv4l1d syntax"  ~> {slurp, "", "~!!\inv4l1d syntax"}
%   "(foo, bar)(baz)"     ~> {slurp, "(foo, bar)", "(baz)"}
%
% In other words: anything that does not start with "(" is an empty plist,
% and the only "syntax checking" performed is that the delimiter stack pushes
% and pops properly.
%
% On mismatch the open delimiters are returned in STACK order, i.e. the most
% recently opened delimiter comes first. That is convenient for programs but
% may surprise a human reading the error.

slurp_plist([Open = #tk{str = "("} | After]) ->
    % the opening paren is both the first slurped token and the first entry
    % on the open-delimiter stack
    slurp_dlist([Open], [Open], After);
slurp_plist(Tokens) ->
    {slurp, [], Tokens}.
|
|
|
|
|
|
% slurp_dlist(Seen, Opens, Tokens)
%
% Worker for slurp_plist/1.
%
%   Seen   - tokens consumed so far, most recent first
%   Opens  - stack of currently open delimiter tokens, most recent first
%   Tokens - tokens not yet looked at
%
% Returns `{slurp, Slurped, Rest}' as soon as the delimiter stack is empty,
% or `{error, {fixme, mismatch, Opens, ClosedBy}}' when the input ends with
% delimiters still open (`ClosedBy = none') or a closing delimiter does not
% pair with the delimiter on top of the stack (`ClosedBy = {value, Token}').

% happy terminal case: stack popped entirely
slurp_dlist(Seen, [], Rest) ->
    {slurp, lists:reverse(Seen), Rest};
% sad terminal case: ran out of tokens with delimiters still open
slurp_dlist(_Seen, Opens, []) ->
    {error, {fixme, mismatch, Opens, none}};
% WMA the stack is nonempty and there is a token to look at
slurp_dlist(Seen, Opens, [Tk | Rest]) ->
    case delim_action(Opens, Tk) of
        pop ->
            slurp_dlist([Tk | Seen], tl(Opens), Rest);
        push ->
            slurp_dlist([Tk | Seen], [Tk | Opens], Rest);
        keep ->
            slurp_dlist([Tk | Seen], Opens, Rest);
        mismatch ->
            {error, {fixme, mismatch, Opens, {value, Tk}}}
    end.


% Decide what a token does to the open-delimiter stack.
%
%   pop      - it closes the delimiter on top of the stack
%   push     - it opens a new delimiter
%   mismatch - it is a closing delimiter that does not pair with the top
%   keep     - it is not a delimiter at all
%
% Clause order matters: a matching close must be recognized before the
% generic "any close is a mismatch" clause.
delim_action([#tk{str = "("} | _], #tk{str = ")"}) -> pop;
delim_action([#tk{str = "["} | _], #tk{str = "]"}) -> pop;
delim_action([#tk{str = "{"} | _], #tk{str = "}"}) -> pop;
delim_action(_Opens, #tk{str = Str})
        when Str =:= "("; Str =:= "["; Str =:= "{" ->
    push;
delim_action(_Opens, #tk{str = Str})
        when Str =:= ")"; Str =:= "]"; Str =:= "}" ->
    mismatch;
delim_action(_Opens, _Token) ->
    keep.
|
|
|
|
|
|
%-------------------------------------------------------
|
|
% API: meta info
|
|
%
|
|
% This is parse order definition, list of keywords, etc
|
|
%
|
|
% -export([
|
|
% token_shapes_parse_order/0,
|
|
% kwds/0
|
|
% ]).
|
|
%-------------------------------------------------------
|
|
|
|
-spec token_shapes_parse_order() -> [tk_shape()].
% @doc
% List of Sophia token shapes in parse order: shapes are tried front to back,
% and once an earlier shape matches, later shapes are not even checked.
%
% For comparison, the rule list this is modelled on (from so_scan):
%
%    Rules =
%      %% Comments and whitespace
%      [ CommentStart
%      , {"//.*", skip()}
%      , {WS,     skip()}
%
%      %% Special characters
%      , {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
%
%      %% Literals
%      , {CHAR,   token(char,   fun parse_char/1)}
%      , {STRING, token(string, fun parse_string/1)}
%      , {HEX,    token(hex,    fun parse_hex/1)}
%      , {INT,    token(int,    fun parse_int/1)}
%      , {BYTES,  token(bytes,  fun parse_bytes/1)}
%
%      %% Identifiers (qualified first!)
%      , {QID,  token(qid,  fun(S) -> string:tokens(S, ".") end)}
%      , {QCON, token(qcon, fun(S) -> string:tokens(S, ".") end)}
%      , {TVAR, token(tvar)}
%      , override({ID, token(id)}, {KW, symbol()}) %% Keywords override identifiers. Need to
%      , {CON, token(con)}                         %% use override to avoid lexing "lettuce"
%                                                  %% as ['let', {id, "tuce"}].
%      %% Operators
%      , {OP, symbol()}
%      ],
% @end

token_shapes_parse_order() ->
    % one named group per concern so the order stays easy to edit
    Trivia      = [lcom, bcom, ws],
    Punctuation = [punct],
    Literals    = [char, string, int16, int10, bytes,
                   ak, ct, sg],
    % qualified names need to go ahead of unqualified ones
    Qualified   = [qid, qcon],
    TypeVars    = [tvar],
    % keywords need to be tried ahead of ids
    Names       = [kwd, id, con],
    % operators such as =, =>, >>
    Operators   = [op],
    lists:append([Trivia, Punctuation, Literals, Qualified,
                  TypeVars, Names, Operators]).
|
|
|
|
|
|
-spec kwds() -> list(string()).
% @doc
% The Sophia keywords, as strings. slurp_token_of_shape/3 reclassifies an
% identifier-shaped token as a keyword when its string is in this list.

kwds() ->
    ["contract", "include", "let", "switch", "type", "record", "datatype"] ++
    ["if", "elif", "else"] ++
    ["function", "stateful", "payable"] ++
    ["true", "false"] ++
    ["mod", "public", "entrypoint", "private", "indexed", "namespace"] ++
    ["interface", "main", "using", "as", "for", "hiding"] ++
    ["band", "bor", "bxor", "bnot"].
|
|
|
|
|
|
%-------------------------------------------------------
|
|
% API: token slurping
|
|
%
|
|
% -export([
|
|
% tokens/1,
|
|
% slurp_token/1,
|
|
% slurp_token_shapes/2,
|
|
% slurp_token_of_shape/2
|
|
% ]).
|
|
%-------------------------------------------------------
|
|
|
|
% Token accessors
|
|
-spec indent_level(tk()) -> pos_integer().
% @doc
% The indent level of a token, which is simply the column component of its
% `{Line, Column}' position.

indent_level(#tk{pos = Pos}) ->
    {_Line, Column} = Pos,
    Column.
|
|
|
|
|
|
|
|
-spec significant_tokens(SrcStr) -> Result
    when SrcStr :: iolist(),
         Result :: {ok, Tokens}
                 | {error, gsc_err()},
         Tokens :: [tk()].
% @doc
% Tokenize `SrcStr' with tokens/1 and keep only the significant tokens (see
% is_significant/1). Anything other than `{ok, Tokens}' coming back from
% tokens/1 is returned unchanged.

significant_tokens(SrcStr) ->
    case tokens(SrcStr) of
        {ok, AllTokens} ->
            {ok, lists:filter(fun is_significant/1, AllTokens)};
        NotOk ->
            NotOk
    end.
|
|
|
|
|
|
|
|
-spec filter_significant(Tokens) -> SignificantTokens
    when Tokens            :: [tk()],
         SignificantTokens :: Tokens.
% @doc
% Drop every token for which is_significant/1 is false, preserving the order
% of the remaining tokens.

filter_significant(Tokens) ->
    [Token || Token <- Tokens, is_significant(Token)].
|
|
|
|
|
|
|
|
-spec is_significant(Token) -> boolean()
    when Token :: tk().
% @doc
% A token is insignificant when its shape is a block comment (`bcom'), a line
% comment (`lcom') or whitespace (`ws'). Everything else is significant.

is_significant(#tk{shape = Shape}) ->
    not lists:member(Shape, [bcom, lcom, ws]);
is_significant(_Other) ->
    true.
|
|
|
|
|
|
-spec tokens_from_iolist(SrcStr) -> Result
    when SrcStr :: iolist(),
         Result :: {ok, Tokens}
                 | {error, gsc_err()},
         Tokens :: [tk()].
% @doc
% Alias for tokens/1; the name just spells out the accepted input type.

tokens_from_iolist(SrcStr) ->
    tokens(SrcStr).
|
|
|
|
|
|
|
|
-spec tokens(SrcStr) -> Result
    when SrcStr :: iolist(),
         Result :: {ok, Tokens}
                 | {error, gsc_err()},
         Tokens :: [tk()].
% @doc
% Tokenize an entire source text.
%
% The input is first normalized with `unicode:characters_to_nfc_list/1'.
% Tokens are then slurped off the front one at a time, starting at position
% `{1, 1}', until the input is exhausted.
%
% Returns `{ok, Tokens}' with every token in source order (whitespace and
% comment tokens included), or `{error, Err}'.
%
% When no token shape matches at some point, `Err' is a
% `#gsc_err_no_tokmatch{}' holding the tokens read so far, the position at
% which matching stopped, and the unconsumed tail of the input. Semantically,
% that tail being nonempty amounts to the presence of an illegal character.

tokens(S) ->
    % defensive normalization
    Normalized = unicode:characters_to_nfc_list(S),
    tokens([], {1, 1}, Normalized).

% Acc holds the tokens read so far, most recent first.
tokens(Acc, _FinalPos, "") ->
    {ok, lists:reverse(Acc)};
tokens(Acc, Pos, SrcStr) ->
    tokens_step(slurp_token(Pos, SrcStr), Acc, Pos, SrcStr).

% Act on the outcome of a single slurp_token/2 call made at `Pos'.
tokens_step({tokmatch, Token = #tk{str = TokStr}, NewSrcStr}, Acc, Pos, _SrcStr) ->
    % the next token starts wherever this token's text ends
    NewPos = new_pos(Pos, TokStr),
    tokens([Token | Acc], NewPos, NewSrcStr);
tokens_step(no_tokmatch, Acc, Pos, SrcStr) ->
    Err = #gsc_err_no_tokmatch{prev_tokens = lists:reverse(Acc),
                               break_pos   = Pos,
                               rest        = SrcStr},
    {error, Err};
% FIXME so_scan bad
% so_scan for some reason allows unterminated block comments at the end of
% files.
% NOTE(review): the original comment here says "for now we're just going to
% agree with so_scan", yet this clause reports an error; presumably the
% compatibility layer is what turns this into so_scan's behavior - confirm.
tokens_step({ierr, unterminated_block_comment}, Acc, Pos, SrcStr) ->
    Err = #gsc_err_bcom_unterminated{prev_tokens = lists:reverse(Acc),
                                     break_pos   = Pos,
                                     rest        = SrcStr},
    {error, Err};
tokens_step(Error = {error, _}, _Acc, _Pos, _SrcStr) ->
    Error.
|
|
|
|
% alright some bullshit here
|
|
%
|
|
% we're computing the line/column position of each string
|
|
%
|
|
% however this is meant to be compatible with so_scan, so it's a bit wonky
|
|
% because regex list bullshit.
|
|
%
|
|
% recall that so_scan operates on the list representation of the utf-8 encoded
|
|
% bytes; this is different than on a list of bignum codepoints (e.g.
|
|
% unicode:characters_to_nfc_list(Bytes)); let's suppose some stupid complicated
|
|
% foreign character which a sane language would simply criminalize has list
|
|
% representation [ABC], but byte representation <<A,B,C>>
|
|
%
|
|
% as far as so_scan is concerned, this means the character ABC consumes 3
|
|
% columns. the only exception is tab characters, which always fast-forward to
|
|
% the next tab stop, which is 1-indexed because god hates all of us
|
|
%
|
|
% so the tab-stops are
|
|
% 1 9 17 25 33 ...
|
|
%
|
|
% column position is determined in all cases by byte order, EXCEPT for $\t
|
|
% which goes to the next tab stop
|
|
%
|
|
% so in general, for the token string, we need to convert to bytes first,
|
|
% then handle `\t` bytes as a special case
|
|
%
|
|
% again in the tokenizer context, we're assuming that the input to our
|
|
% tokenizer is an nfc-list which has a flat list of each unicode character in
|
|
% codepoint form
|
|
%
|
|
% here we're just converting it to byte form, then computing columns based on
|
|
% bytes
|
|
% new_pos(OldPos, TokStr)
%
% Position reached after the text `TokStr', given that the text starts at
% `OldPos'. Positions are `{Line, Column}', both 1-based.
%
% The text is encoded to UTF-8 first and the column advances once per BYTE,
% not once per codepoint. A newline byte moves to column 1 of the next line;
% a tab byte jumps to the next tab stop (columns 1, 9, 17, 25, ...).
new_pos(OldPos, TokStr) ->
    Utf8 = unicode:characters_to_binary(TokStr),
    new_pos_bytes(Utf8, OldPos).

% Walk the bytes one at a time, threading the position through.
new_pos_bytes(<<>>, FinalPos) ->
    FinalPos;
new_pos_bytes(<<Byte:8, Rest/bytes>>, Pos) ->
    new_pos_bytes(Rest, advance_byte(Byte, Pos)).

% Position after a single byte.
advance_byte($\n, {Line, _Col}) ->
    % newline: first column of the following line
    {Line + 1, 1};
advance_byte($\t, {Line, Col1}) ->
    % 1-based tab stops are at 1 9 17 25 ...
    % 0-based tab stops are at 0 8 16 24 ...
    % so: shift to 0-based, find the next stop, shift back to 1-based
    Col0 = Col1 - 1,
    {Line, next_tabstop8(Col0) + 1};
advance_byte(_AnyOtherByte, {Line, Col1}) ->
    % in general advance by 1
    {Line, Col1 + 1}.

% Next 0-based tab stop strictly after the 0-based column `Col0'.
%
% stops:  0    8    16   24   etc
%         0*8  1*8  2*8  3*8  etc
next_tabstop8(Col0) when Col0 >= 0 ->
    % Col0 lies in [Q*8, Q*8 + 7]; the stop after it is (Q + 1)*8
    (Col0 div 8 + 1) * 8.
|
|
|
|
%% copied from so_scan_lib.erl just to match behavior
|
|
%-define(TAB_SIZE, 8).
|
|
%
|
|
%next_pos([], P) -> P;
|
|
%next_pos([$\n | S], {L, _}) -> next_pos(S, {L + 1, 1});
|
|
%next_pos([$\t | S], {L, C}) -> next_pos(S, {L, (C + ?TAB_SIZE - 1) div ?TAB_SIZE * ?TAB_SIZE + 1});
|
|
%next_pos([_ | S], {L, C}) -> next_pos(S, {L, C + 1}).
|
|
|
|
|
|
|
|
-spec slurp_token(Pos, SrcStr) -> Result
    when Pos    :: tk_pos(),
         SrcStr :: string(),
         Result :: {tokmatch, Token, Rest}
                 | no_tokmatch
                 | {error, gsc_err()}
                 | {ierr, unterminated_block_comment},
         Token  :: tk(),
         Rest   :: string().
% @doc
% Grab a single token off the front of `SrcStr', trying the token shapes in
% the order given by `token_shapes_parse_order/0'. `Pos' is recorded as the
% position of the resulting token.

slurp_token(Pos, SrcStr) ->
    % kept as a separate step so the shape order is easy to swap out
    ParseOrder = token_shapes_parse_order(),
    slurp_token_shapes(ParseOrder, Pos, SrcStr).
|
|
|
|
|
|
|
|
-spec slurp_token_shapes(ParseOrder, Pos, SrcStr) -> Result
    when ParseOrder :: [tk_shape()],
         Pos        :: tk_pos(),
         SrcStr     :: string(),
         Result     :: {tokmatch, Token, Rest}
                     | no_tokmatch
                     | {error, gsc_err()}
                     | {ierr, unterminated_block_comment},
         Token      :: tk(),
         Rest       :: string().
% @doc
% Grab a single token off the front of `SrcStr', trying the shapes in
% `ParseOrder' from first to last.
%
% The first shape that produces anything other than `no_tokmatch' decides the
% result: a match, an `{ierr, _}' or an `{error, _}' is returned as-is and the
% remaining shapes are not tried. `no_tokmatch' is returned only when every
% shape in the list fails to match.

slurp_token_shapes([], _Pos, _SrcStr) ->
    no_tokmatch;
slurp_token_shapes([Shape | OtherShapes], Pos, SrcStr) ->
    case slurp_token_of_shape(Shape, Pos, SrcStr) of
        no_tokmatch          -> slurp_token_shapes(OtherShapes, Pos, SrcStr);
        {tokmatch, _, _} = M -> M;
        {ierr, _} = IErr     -> IErr;
        {error, _} = Error   -> Error
    end.
|
|
|
|
|
|
-spec slurp_token_of_shape(TokenType, Pos, SrcStr) -> MaybeToken
    when TokenType  :: tk_shape(),
         Pos        :: tk_pos(),
         SrcStr     :: string(),
         MaybeToken :: {tokmatch, Token, Rest}
                     | no_tokmatch
                     | {error, gsc_err()}
                     | {ierr, unterminated_block_comment},
         Token      :: tk(),
         Rest       :: string().
% @doc
% Match a Sophia token of the given shape off the front of `SrcStr'.
%
% On success the token records the requested shape, `Pos' as its position and
% the matched text as its string; `Rest' is the input after the match.
% `no_tokmatch' means the input does not start with a token of this shape.
%
% An unknown shape is a programmer error and raises (via `error/1') a
% `#gsc_err{atom = nyi}' record rather than returning.
% @end

% COMMENTS AND WHITESPACE: lcom, bcom, ws
%
% Line comment: "//" up to, but not including, the end of the line.
% FIXME: make a string matcher for line comments
slurp_token_of_shape(lcom, Pos, SrcStr = "//" ++ _) ->
    {Line, Rest} = takeline("", SrcStr),
    {tokmatch, #tk{shape = lcom, pos = Pos, str = Line}, Rest};
slurp_token_of_shape(lcom, _Pos, _SrcStr) ->
    no_tokmatch;
% Block comments cannot have a string matcher because nested block comments
% require keeping track of depth.
slurp_token_of_shape(bcom, Pos, "/*" ++ AfterOpen) ->
    case bcom("/*", 1, AfterOpen) of
        {ok, CommentStr, Rest} ->
            {tokmatch, #tk{shape = bcom, pos = Pos, str = CommentStr}, Rest};
        Error ->
            % e.g. {ierr, unterminated_block_comment}
            Error
    end;
slurp_token_of_shape(bcom, _Pos, _SrcStr) ->
    no_tokmatch;
slurp_token_of_shape(ws, Pos, SrcStr) ->
    slurp_via_matcher(ws, gs_strmatch:smr_sf_ws(), Pos, SrcStr);
% KEYWORDS, OPERATORS, PUNCTUATION: kwd, op, punct
%
% Keywords are allowed to be prefixes of user-defined names; e.g. "lettuce"
% must be read as an id, not as ["let", "tuce"], so keywords cannot be matched
% greedily. Every keyword is a valid id, so the text is matched as an id first
% and then checked against the keyword list.
slurp_token_of_shape(kwd, Pos, SrcStr) ->
    case slurp_token_of_shape(id, Pos, SrcStr) of
        {tokmatch, IdTok = #tk{str = IdStr}, Rest} ->
            case lists:member(IdStr, kwds()) of
                true  -> {tokmatch, IdTok#tk{shape = kwd}, Rest};
                false -> no_tokmatch
            end;
        no_tokmatch ->
            no_tokmatch
    end;
slurp_token_of_shape(op, Pos, SrcStr) ->
    slurp_via_matcher(op, gs_strmatch:smr_sf_op(), Pos, SrcStr);
slurp_token_of_shape(punct, Pos, SrcStr) ->
    slurp_via_matcher(punct, gs_strmatch:smr_sf_punct(), Pos, SrcStr);
% SOPHIA VARIABLE NAMES: id, con, qid, qcon, tvar
slurp_token_of_shape(id, Pos, SrcStr) ->
    slurp_via_matcher(id, gs_strmatch:smr_sf_id(), Pos, SrcStr);
slurp_token_of_shape(con, Pos, SrcStr) ->
    slurp_via_matcher(con, gs_strmatch:smr_sf_con(), Pos, SrcStr);
slurp_token_of_shape(qid, Pos, SrcStr) ->
    slurp_via_matcher(qid, gs_strmatch:smr_sf_qid(), Pos, SrcStr);
slurp_token_of_shape(qcon, Pos, SrcStr) ->
    slurp_via_matcher(qcon, gs_strmatch:smr_sf_qcon(), Pos, SrcStr);
slurp_token_of_shape(tvar, Pos, SrcStr) ->
    slurp_via_matcher(tvar, gs_strmatch:smr_sf_tvar(), Pos, SrcStr);
% LITERALS: int16, int10, char, string, bytes, ak, ct, sg
slurp_token_of_shape(int16, Pos, SrcStr) ->
    slurp_via_matcher(int16, gs_strmatch:smr_sf_int16(), Pos, SrcStr);
slurp_token_of_shape(int10, Pos, SrcStr) ->
    slurp_via_matcher(int10, gs_strmatch:smr_sf_int10(), Pos, SrcStr);
slurp_token_of_shape(char, Pos, SrcStr) ->
    slurp_via_matcher(char, gs_strmatch:smr_sf_char(), Pos, SrcStr);
slurp_token_of_shape(string, Pos, SrcStr) ->
    slurp_via_matcher(string, gs_strmatch:smr_sf_str(), Pos, SrcStr);
slurp_token_of_shape(bytes, Pos, SrcStr) ->
    slurp_via_matcher(bytes, gs_strmatch:smr_sf_bytes(), Pos, SrcStr);
slurp_token_of_shape(ak, Pos, SrcStr) ->
    slurp_via_matcher(ak, gs_strmatch:smr_sf_ak(), Pos, SrcStr);
slurp_token_of_shape(ct, Pos, SrcStr) ->
    slurp_via_matcher(ct, gs_strmatch:smr_sf_ct(), Pos, SrcStr);
slurp_token_of_shape(sg, Pos, SrcStr) ->
    slurp_via_matcher(sg, gs_strmatch:smr_sf_sg(), Pos, SrcStr);
% anything else is a shape this function does not know how to slurp
slurp_token_of_shape(NyiType, Pos, SrcStr) ->
    Message = io_lib:format("cannot slurp token of shape: ~p", [NyiType]),
    error(#gsc_err{atom  = nyi,
                   str   = Message,
                   extra = [{token_shape, NyiType},
                            {pos, Pos},
                            {rest, SrcStr}]}).


% Run a gs_strmatch matcher against the front of `SrcStr' and wrap a
% successful match in a token of shape `Shape' positioned at `Pos'.
slurp_via_matcher(Shape, Matcher, Pos, SrcStr) ->
    case gs_strmatch:match(Matcher, SrcStr) of
        {strmatch, Str, Rest} ->
            {tokmatch, #tk{shape = Shape, pos = Pos, str = Str}, Rest};
        no_strmatch ->
            no_tokmatch
    end.
|
|
|
|
|
|
|
|
% takeline(Acc, Str)
%
% Split `Str' just before its first newline. Returns `{Line, Rest}' where
% `Line' is `Acc' reversed followed by everything ahead of the newline, and
% `Rest' starts with the newline itself (or is empty when `Str' contains no
% newline).
takeline(Acc, Str) ->
    NotNewline = fun(Char) -> Char =/= $\n end,
    {UpToNewline, Rest} = lists:splitwith(NotNewline, Str),
    % Acc is a reversed accumulator: un-reverse it onto the front
    {lists:reverse(Acc, UpToNewline), Rest}.
|
|
|
|
|
|
% bcom(CommentStr, Depth, SrcStr)
%
% Consume the remainder of a (possibly nested) block comment.
%
%   CommentStr - comment text collected so far, as chardata
%   Depth      - number of "/*" openers not yet closed
%   SrcStr     - input still to be read
%
% Returns `{ok, Comment, Rest}' once the depth drops to 0, where `Comment' is
% the collected text flattened with `unicode:characters_to_nfc_list/1', or
% `{ierr, unterminated_block_comment}' when the input ends first.

% every opener has been closed: done
bcom(CommentStr, 0, SrcStr) ->
    {ok, unicode:characters_to_nfc_list(CommentStr), SrcStr};
% premature end of input
bcom(_CommentStr, Depth, "") when Depth > 0 ->
    {ierr, unterminated_block_comment};
% a closer: one level shallower
bcom(CommentStr, Depth, "*/" ++ Rest) when Depth > 0 ->
    bcom([CommentStr, "*/"], Depth - 1, Rest);
% a nested opener: one level deeper
bcom(CommentStr, Depth, "/*" ++ Rest) when Depth > 0 ->
    bcom([CommentStr, "/*"], Depth + 1, Rest);
% any other character: same depth, just collect it
bcom(CommentStr, Depth, [Char | Rest]) when Depth > 0 ->
    bcom([CommentStr, Char], Depth, Rest).
|
|
|
|
|
|
%------------------------------------------
|
|
% INTERNAL UTILITIES
|
|
%------------------------------------------
|
|
|
|
-spec take_while(Pred, List) -> {Taken, Rest}
    when Pred  :: fun((Item) -> boolean()),
         List  :: [Item],
         Taken :: List,
         Rest  :: List.
% @doc
% Similar to lists:takewhile/2, but returns both halves: `Taken' is the
% longest prefix of `List' whose items all satisfy `Pred', and `Rest' is
% whatever follows it. The name is a reminder that two things come back.

take_while(Pred, List) ->
    lists:splitwith(Pred, List).


-spec take_while(Pred, Prefix, List) -> {Taken, Rest}
    when Pred   :: fun((Item) -> boolean()),
         Prefix :: List,
         List   :: [Item],
         Taken  :: List,
         Rest   :: List.
% @doc
% Similar to take_while/2, but returns `{Prefix ++ Taken, Rest}'.
%
% `Prefix' holds items that are already known to belong to the result; `Pred'
% is never applied to them.

take_while(Pred, Pfx, List) ->
    {Taken, Rest} = lists:splitwith(Pred, List),
    {Pfx ++ Taken, Rest}.
|