more mass renaming

This commit is contained in:
2026-06-02 01:48:05 -07:00
parent eff77fff6b
commit 270f192f0c
53 changed files with 1264 additions and 431 deletions
+802
View File
@@ -0,0 +1,802 @@
% @doc
% Ref: so_scan.erl
%
% This file contains a sophia tokenizer written in straightforward erlang with data
% types that are sane.
%
% For MVP it mimics the behavior of so_scan exactly, in terms of like what its
% definition of a token is and so on.
%
% gsc_so_scan.erl contains a compatibility layer that should agree with so_scan
% exactly. It converts the data types here to the shapes that so_scan outputs.
%
% This is for two reasons:
%
% 1. in order to enable testing the two modules against each other, and
% 2. to future-proof in case we decide to incrementally incorporate the gsc
% code into the legacy sophia compiler
% @end
-module(gs_tokens).
% meta
-export([
token_shapes_parse_order/0,
kwds/0
]).
-export([
take_while/2,
take_while/3,
take_block/1,
take_block_item/1,
strings/2,
slurp_plist/1
]).
% token slurping
-export([
indent_level/1,
is_significant/1,
filter_significant/1,
significant_tokens/1,
tokens_from_iolist/1,
tokens/1,
slurp_token/2,
slurp_token_shapes/3,
slurp_token_of_shape/3,
new_pos/2
]).
-include("$gsc_include/gsc.hrl").
%=======================================================
% API: functions
%=======================================================
-spec strings(N, Tokens) -> AtMostNStrings
    when N              :: non_neg_integer(),
         Tokens         :: [tk()],
         AtMostNStrings :: [string()].
% @doc
% Return the `str' field of each of the first N tokens, in input order.
% When fewer than N tokens are available the strings of all of them are
% returned, so the result has at most N elements.
strings(0, _Tokens) ->
    [];
strings(_N, []) ->
    [];
strings(N, [#tk{str = Str} | Tail]) when is_integer(N), N >= 1 ->
    [Str | strings(N - 1, Tail)].
% used by parser
%
% a block is a column-delimited list of block items
%
% BLOCK =
% foo
% ...
% bar
% ...
% baz
% ...
%
% BLOCK_ITEM =
% foo
% ...
-spec take_block(Tokens) -> {BlockTokens, Rest}
    when Tokens      :: [tk()],
         BlockTokens :: Tokens,
         Rest        :: Tokens.
% @doc
% Split off the block that starts at the head token.
%
% The head token is always taken. After it, tokens keep being taken for as
% long as their column is >= the head token's column; the first token that
% sits further left ends the block and becomes the head of `Rest'.
% An empty input yields `{[], []}'.
take_block([]) ->
    {[], []};
take_block([Head = #tk{pos = {_Line, BlockCol}} | Tail]) ->
    InBlock = fun(#tk{pos = {_, Col}}) -> Col >= BlockCol end,
    take_while(InBlock, [Head], Tail).
-spec take_block_item(Tokens) -> {ItemTokens, Rest}
    when Tokens     :: [tk()],
         ItemTokens :: Tokens,
         Rest       :: Tokens.
% @doc
% Split off the block item that starts at the head token.
%
% The head token is always taken. After it, tokens keep being taken for as
% long as their column is strictly greater than the head token's column; the
% first token at the same column or further left ends the item and becomes
% the head of `Rest'. An empty input yields `{[], []}'.
take_block_item([]) ->
    {[], []};
take_block_item([Head = #tk{pos = {_Line, ItemCol}} | Tail]) ->
    InItem = fun(#tk{pos = {_, Col}}) -> Col > ItemCol end,
    take_while(InItem, [Head], Tail).
-spec slurp_plist(Tokens) -> Result
    when Tokens    :: [Token],
         Result    :: {slurp, PList :: Tokens, After :: Tokens}
                    | {error, Mismatch},
         Mismatch  :: {fixme, mismatch, OpenStack, ClosedBy},
         OpenStack :: Tokens,
         ClosedBy  :: none | {value, Token},
         Token     :: tk().
% @doc
% Slurp a parenthesized list off the front of the token list.
%
% The verb is `slurp' rather than `take' because delimiter matching is
% enforced: every "(", "[" and "{" opened inside the list must be closed by
% its own kind of closer.
%
% The examples below show token strings, not actual return values.
%
% typical happy path:
%     "(foo, bar) baz"  ~> {slurp, "(foo, bar)", "baz"}
%     "() baz"          ~> {slurp, "()", "baz"}
%     "foo () baz"      ~> {slurp, "", "foo () baz"}
%
% typical sad path:
%     "(foo, bar]"      ~> {error, {fixme, mismatch, ["("], {value, "]"}}}
%     "(foo, bar"       ~> {error, {fixme, mismatch, ["("], none}}
%     "([foo, bar)"     ~> {error, {fixme, mismatch, ["[", "("], {value, ")"}}}
%
% counterintuitive (input that does not start with "(" is never inspected):
%     "[foo, bar) baz"     ~> {slurp, "", "[foo, bar) baz"}
%     "~!!\inv4l1d syntax" ~> {slurp, "", "~!!\inv4l1d syntax"}
%     "(foo, bar)(baz)"    ~> {slurp, "(foo, bar)", "(baz)"}
%
% The only "syntax checking" done here is making sure the delimiter stack
% pushes and pops properly.
%
% On mismatch the open delimiters are returned in STACK order, i.e. the most
% recently opened delimiter comes first. That is the convenient order for
% programs, but it is the reverse of source order, which may surprise
% somebody reading an error message.
slurp_plist([Open = #tk{str = "("} | Rest]) ->
    % the opening paren is both the first consumed token and the first entry
    % on the open-delimiter stack
    slurp_dlist([Open], [Open], Rest);
slurp_plist(Tokens) ->
    {slurp, [], Tokens}.
% slurp_dlist(Seen, Opens, Tokens) -> {slurp, Consumed, Rest} | {error, Mismatch}
%
% Delimiter-matching worker behind slurp_plist/1.
%
%   Seen   - every token consumed so far, most recent first
%   Opens  - stack of opening delimiters that are still unclosed, most
%            recently opened first
%   Tokens - input that has not been looked at yet
%
% Succeeds with {slurp, Consumed, Rest} as soon as the stack is empty.
% Fails with {error, {fixme, mismatch, Opens, ClosedBy}} when the input ends
% while delimiters are still open (ClosedBy = none) or when a closer does not
% match the delimiter on top of the stack (ClosedBy = {value, BadCloser}).
%
% happy terminal case: stack popped entirely
slurp_dlist(Seen, [], Rest) ->
    {slurp, lists:reverse(Seen), Rest};
% from here on the stack is known to be non-empty
% happy: a closer that matches the top of the stack pops it
slurp_dlist(Seen, [#tk{str = "("} | Opens], [Tk = #tk{str = ")"} | Rest]) ->
    slurp_dlist([Tk | Seen], Opens, Rest);
slurp_dlist(Seen, [#tk{str = "["} | Opens], [Tk = #tk{str = "]"} | Rest]) ->
    slurp_dlist([Tk | Seen], Opens, Rest);
slurp_dlist(Seen, [#tk{str = "{"} | Opens], [Tk = #tk{str = "}"} | Rest]) ->
    slurp_dlist([Tk | Seen], Opens, Rest);
% happy: any opening delimiter gets pushed
slurp_dlist(Seen, Opens, [Tk = #tk{str = Str} | Rest])
        when Str =:= "("; Str =:= "["; Str =:= "{" ->
    slurp_dlist([Tk | Seen], [Tk | Opens], Rest);
% sad: input ran out while delimiters are still open
slurp_dlist(_Seen, Opens, []) ->
    {error, {fixme, mismatch, Opens, none}};
% sad: a closer reached this clause, so it did not match the top of the stack
slurp_dlist(_Seen, Opens, [BadClose = #tk{str = Str} | _])
        when Str =:= ")"; Str =:= "]"; Str =:= "}" ->
    {error, {fixme, mismatch, Opens, {value, BadClose}}};
% general case: a non-delimiter token is simply consumed
slurp_dlist(Seen, Opens, [Tk | Rest]) ->
    slurp_dlist([Tk | Seen], Opens, Rest).
%-------------------------------------------------------
% API: meta info
%
% This is parse order definition, list of keywords, etc
%
% -export([
% token_shapes_parse_order/0,
% kwds/0
% ]).
%-------------------------------------------------------
-spec token_shapes_parse_order() -> [tk_shape()].
% @doc
% List of sophia token shapes in parse order: shapes are tried from left to
% right, and once an earlier shape matches the later ones are not checked.
%
% For reference, the rule table this order was modelled on (so_scan):
%
%    Rules =
%        %% Comments and whitespace
%        [ CommentStart
%        , {"//.*", skip()}
%        , {WS,     skip()}
%
%        %% Special characters
%        , {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
%
%        %% Literals
%        , {CHAR,   token(char,   fun parse_char/1)}
%        , {STRING, token(string, fun parse_string/1)}
%        , {HEX,    token(hex,    fun parse_hex/1)}
%        , {INT,    token(int,    fun parse_int/1)}
%        , {BYTES,  token(bytes,  fun parse_bytes/1)}
%
%        %% Identifiers (qualified first!)
%        , {QID,    token(qid,  fun(S) -> string:tokens(S, ".") end)}
%        , {QCON,   token(qcon, fun(S) -> string:tokens(S, ".") end)}
%        , {TVAR,   token(tvar)}
%        , override({ID, token(id)}, {KW, symbol()}) %% Keywords override identifiers. Need to
%        , {CON,    token(con)}                      %% use override to avoid lexing "lettuce"
%                                                    %% as ['let', {id, "tuce"}].
%        %% Operators
%        , {OP,     symbol()}
%        ],
% @end
token_shapes_parse_order() ->
    % one named group per category so that the order stays easy to edit
    CommentsAndWhitespace = [lcom, bcom, ws],
    Punctuation           = [punct],
    Literals              = [char, string, int16, int10, bytes,
                             ak, ct, sg],
    % qualified names need to go ahead of unqualified ones
    QualifiedNames        = [qid, qcon],
    TypeVariables         = [tvar],
    % keywords need to be tried ahead of ids
    KeywordsThenIds       = [kwd, id],
    Constructors          = [con],
    % ops [=, =>, >>]
    Operators             = [op],
    lists:append([CommentsAndWhitespace,
                  Punctuation,
                  Literals,
                  QualifiedNames,
                  TypeVariables,
                  KeywordsThenIds,
                  Constructors,
                  Operators]).
-spec kwds() -> list(string()).
% @doc
% The sophia keywords, as plain strings. The kwd token shape is recognised
% by lexing an id and then checking membership in this list.
kwds() ->
    [
        "contract", "include", "let", "switch",
        "type", "record", "datatype", "if",
        "elif", "else", "function", "stateful",
        "payable", "true", "false", "mod",
        "public", "entrypoint", "private", "indexed",
        "namespace", "interface", "main", "using",
        "as", "for", "hiding", "band",
        "bor", "bxor", "bnot"
    ].
%-------------------------------------------------------
% API: token slurping
%
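% -export([
% indent_level/1,
% is_significant/1,
% filter_significant/1,
% significant_tokens/1,
% tokens_from_iolist/1,
% tokens/1,
% slurp_token/2,
% slurp_token_shapes/3,
% slurp_token_of_shape/3,
% new_pos/2
% ]).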
%-------------------------------------------------------
% Token accessors
-spec indent_level(tk()) -> pos_integer().
% @doc
% The second component of the token's `pos' field, i.e. the column the
% token starts at.
indent_level(#tk{pos = {_Line, Column}}) ->
    Column.
-spec significant_tokens(SrcStr) -> Result
    when SrcStr :: iolist(),
         Result :: {ok, Tokens}
                 | {error, gsc_err()},
         Tokens :: [tk()].
% @doc
% Tokenize SrcStr with tokens/1 and drop the insignificant tokens (see
% is_significant/1) from a successful result. Anything other than
% `{ok, Tokens}' coming back from tokens/1 is passed through untouched.
significant_tokens(SrcStr) ->
    case tokens(SrcStr) of
        {ok, AllTokens} -> {ok, filter_significant(AllTokens)};
        NotOk           -> NotOk
    end.
-spec filter_significant(Tokens) -> SignificantTokens
    when Tokens            :: [tk()],
         SignificantTokens :: Tokens.
% @doc
% Keep only the tokens for which is_significant/1 is true, preserving order.
filter_significant(Tokens) ->
    [Tk || Tk <- Tokens, is_significant(Tk)].
-spec is_significant(Token) -> boolean()
    when Token :: tk().
% @doc
% false for block comments (bcom), line comments (lcom) and whitespace (ws);
% true for everything else.
is_significant(#tk{shape = Shape})
        when Shape =:= bcom; Shape =:= lcom; Shape =:= ws ->
    false;
is_significant(_Other) ->
    true.
-spec tokens_from_iolist(SrcStr) -> Result when
    SrcStr :: iolist(),
    Result :: {ok, Tokens}
            | {error, gsc_err()},
    Tokens :: [tk()].
% @doc
% Alias for tokens/1; the name spells out the expected input type.
tokens_from_iolist(SrcStr) ->
    tokens(SrcStr).
-spec tokens(SrcStr) -> Result
    when SrcStr :: iolist(),
         Result :: {ok, Tokens}
                 | {error, gsc_err()},
         Tokens :: [tk()].
% @doc
% Tokenize the whole input, starting at position {1, 1}.
%
% Tokens are slurped off the front of the input one after another until the
% input is empty, in which case `{ok, Tokens}' is returned (comments and
% whitespace included).
%
% If at some point no token shape matches, the result is
% `{error, #gsc_err_no_tokmatch{}}' carrying the tokens read so far, the
% position where lexing stopped and the unconsumed input. Semantically this
% amounts to the presence of an illegal character at that position.
tokens(S) ->
    % defensive normalization
    Chars = unicode:characters_to_nfc_list(S),
    tokens([], {1, 1}, Chars).

% tokens(Acc, Pos, Src): Acc holds the tokens read so far, most recent first;
% Pos is the position of the first character of Src.
tokens(Acc, _EndPos, "") ->
    {ok, lists:reverse(Acc)};
tokens(Acc, Pos, Src) ->
    case slurp_token(Pos, Src) of
        {tokmatch, #tk{str = Str} = Tk, SrcTail} ->
            % the next token starts where this token's text ends
            tokens([Tk | Acc], new_pos(Pos, Str), SrcTail);
        no_tokmatch ->
            {error, #gsc_err_no_tokmatch{prev_tokens = lists:reverse(Acc),
                                         break_pos   = Pos,
                                         rest        = Src}};
        % FIXME so_scan compatibility
        % so_scan is said to allow an unterminated block comment at the end
        % of a file. This clause reports it as an error.
        % NOTE(review): the previous comment here said "for now we're just
        % going to agree with so_scan" - confirm which behaviour is intended.
        {ierr, unterminated_block_comment} ->
            {error, #gsc_err_bcom_unterminated{prev_tokens = lists:reverse(Acc),
                                               break_pos   = Pos,
                                               rest        = Src}};
        {error, _} = Error ->
            Error
    end.
% alright some bullshit here
%
% we're computing the line/column position of each string
%
% however this is meant to be compatible with so_scan, so it's a bit wonky
% because regex list bullshit.
%
% recall that so_scan operates on the list representation of the utf-8 encoded
% bytes; this is different than on a list of bignum codepoints (e.g.
% unicode:characters_to_nfc_list(Bytes)); let's suppose some stupid complicated
% foreign character which a sane language would simply criminalize has list
% representation [ABC], but byte representation <<A,B,C>>
%
% as far as so_scan is concerned, this means the character ABC consumes 3
% columns. the only exception is tab characters, which always fast-forward to
% the next tab stop, which is 1-indexed because god hates all of us
%
% so the tab-stops are
% 1 9 17 25 33 ...
%
% column position is determined in all cases by byte order, EXCEPT for $\t
% which goes to the next tab stop
%
% so in general, for the token string, we need to convert to bytes first,
% then handle `\t` bytes as a special case
%
% again in the tokenizer context, we're assuming that the input to our
% tokenizer is an nfc-list which has a flat list of each unicode character in
% codepoint form
%
% here we're just converting it to byte form, then computing columns based on
% bytes
% new_pos(OldPos, TokStr) -> NewPos
%
% Position reached after the text TokStr, which starts at OldPos = {Line, Col}
% (both 1-based). TokStr is converted to its UTF-8 bytes and the position is
% advanced byte by byte, so a character that encodes to several bytes takes
% up several columns.
new_pos(OldPos, TokStr) ->
    Utf8 = unicode:characters_to_binary(TokStr),
    new_pos_bytes(Utf8, OldPos).

% out of bytes: the current position is the result
new_pos_bytes(<<>>, Pos) ->
    Pos;
% newline: first column of the next line
new_pos_bytes(<<$\n, Tail/binary>>, {Line, _Col}) ->
    new_pos_bytes(Tail, {Line + 1, 1});
% tab: jump to the next tab stop
%
% with 1-based columns the tab stops are at   1  9  17  25 ...
% with 0-based columns the tab stops are at   0  8  16  24 ...
% the arithmetic is done 0-based and converted back afterwards
new_pos_bytes(<<$\t, Tail/binary>>, {Line, Col1}) ->
    TabCol1 = next_tabstop8(Col1 - 1) + 1,
    new_pos_bytes(Tail, {Line, TabCol1});
% any other byte: one column to the right
new_pos_bytes(<<_Byte, Tail/binary>>, {Line, Col1}) ->
    new_pos_bytes(Tail, {Line, Col1 + 1}).

% next_tabstop8(Col0): the first multiple of 8 that is strictly greater than
% the 0-based column Col0
%
%   0     8     16    24    etc
%   0*8   1*8   2*8   3*8   etc
next_tabstop8(Col0) when Col0 >= 0 ->
    ((Col0 div 8) + 1) * 8.
%% copied from so_scan_lib.erl just to match behavior
%-define(TAB_SIZE, 8).
%
%next_pos([], P) -> P;
%next_pos([$\n | S], {L, _}) -> next_pos(S, {L + 1, 1});
%next_pos([$\t | S], {L, C}) -> next_pos(S, {L, (C + ?TAB_SIZE - 1) div ?TAB_SIZE * ?TAB_SIZE + 1});
%next_pos([_ | S], {L, C}) -> next_pos(S, {L, C + 1}).
-spec slurp_token(Pos, SrcStr) -> Result
    when Pos    :: tk_pos(),
         SrcStr :: string(),
         Result :: {tokmatch, Token, Rest}
                 | no_tokmatch
                 | {error, gsc_err()}
                 | {ierr, unterminated_block_comment},
         Token  :: tk(),
         Rest   :: string().
% @doc
% Grab a single token off the front of the string, trying the token shapes
% in the order given by `token_shapes_parse_order/0'.
slurp_token(Pos, SrcStr) ->
    % the order is kept in a variable so it is easy to swap out when debugging
    ParseOrder = token_shapes_parse_order(),
    slurp_token_shapes(ParseOrder, Pos, SrcStr).
-spec slurp_token_shapes(ParseOrder, Pos, SrcStr) -> Result
    when ParseOrder :: [tk_shape()],
         Pos        :: tk_pos(),
         SrcStr     :: string(),
         Result     :: {tokmatch, Token, Rest}
                     | no_tokmatch
                     | {error, gsc_err()}
                     | {ierr, unterminated_block_comment},
         Token      :: tk(),
         Rest       :: string().
% @doc
% Grab a single token off the front of the string, trying the shapes of
% ParseOrder from left to right.
%
% The first shape that yields anything other than `no_tokmatch' decides the
% result; later shapes are not tried. `no_tokmatch' is returned only when
% every shape in ParseOrder fails to match.
slurp_token_shapes([], _Pos, _SrcStr) ->
    no_tokmatch;
slurp_token_shapes([Shape | Shapes], Pos, SrcStr) ->
    case slurp_token_of_shape(Shape, Pos, SrcStr) of
        no_tokmatch              -> slurp_token_shapes(Shapes, Pos, SrcStr);
        {tokmatch, _, _} = Match -> Match;
        {ierr, _} = IErr         -> IErr;
        {error, _} = Error       -> Error
    end.
-spec slurp_token_of_shape(TokenType, Pos, SrcStr) -> MaybeToken
    when TokenType  :: tk_shape(),
         Pos        :: tk_pos(),
         SrcStr     :: string(),
         MaybeToken :: {tokmatch, Token, Rest}
                     | no_tokmatch
                     | {error, gsc_err()}
                     | {ierr, unterminated_block_comment},
         Token      :: tk(),
         Rest       :: string().
% @doc
% Match a sophia token of a given shape off the front of the string.
%
% On success the token records the shape, the given Pos and the matched
% text. A shape this function has no clause for raises an error exception
% carrying a `#gsc_err{atom = nyi}' record.
% @end

% COMMENTS AND WHITESPACE: lcom, bcom, ws
%
% sophia line comment: everything from "//" up to, but not including, the
% next newline
%
% FIXME: make a string matcher for line comments
slurp_token_of_shape(lcom, Pos, SrcStr = "//" ++ _) ->
    {Line, Rest} = takeline("", SrcStr),
    {tokmatch, #tk{shape = lcom, pos = Pos, str = Line}, Rest};
slurp_token_of_shape(lcom, _Pos, _SrcStr) ->
    no_tokmatch;
% Block comments cannot have a string matcher because block comments nest,
% so a depth counter has to be tracked while scanning
slurp_token_of_shape(bcom, Pos, "/*" ++ AfterOpen) ->
    case bcom("/*", 1, AfterOpen) of
        {ok, CommentStr, Rest} ->
            {tokmatch, #tk{shape = bcom, pos = Pos, str = CommentStr}, Rest};
        Error ->
            Error
    end;
slurp_token_of_shape(bcom, _Pos, _SrcStr) ->
    no_tokmatch;
slurp_token_of_shape(ws, Pos, SrcStr) ->
    slurp_strmatch_token(ws, gs_strmatch:smr_sf_ws(), Pos, SrcStr);
% KEYWORDS, OPERATORS, PUNCTUATION: kwd, op, punct
%
% kwds are allowed to be prefixes of user-defined variable names; e.g.
% "lettuce" should be lexed as an id, not as ["let", "tuce"], so kwds must
% not be matched greedily on their own
%
% every kwd is also a valid id, so we lex an id and then check whether its
% text is one of the kwds
slurp_token_of_shape(kwd, Pos, SrcStr) ->
    case slurp_token_of_shape(id, Pos, SrcStr) of
        {tokmatch, IdTok = #tk{str = IdStr}, Rest} ->
            case lists:member(IdStr, kwds()) of
                true  -> {tokmatch, IdTok#tk{shape = kwd}, Rest};
                false -> no_tokmatch
            end;
        no_tokmatch ->
            no_tokmatch
    end;
slurp_token_of_shape(op, Pos, SrcStr) ->
    slurp_strmatch_token(op, gs_strmatch:smr_sf_op(), Pos, SrcStr);
slurp_token_of_shape(punct, Pos, SrcStr) ->
    slurp_strmatch_token(punct, gs_strmatch:smr_sf_punct(), Pos, SrcStr);
% SOPHIA VARIABLE NAMES: id, con, qid, qcon, tvar
slurp_token_of_shape(id, Pos, SrcStr) ->
    slurp_strmatch_token(id, gs_strmatch:smr_sf_id(), Pos, SrcStr);
slurp_token_of_shape(con, Pos, SrcStr) ->
    slurp_strmatch_token(con, gs_strmatch:smr_sf_con(), Pos, SrcStr);
slurp_token_of_shape(qid, Pos, SrcStr) ->
    slurp_strmatch_token(qid, gs_strmatch:smr_sf_qid(), Pos, SrcStr);
slurp_token_of_shape(qcon, Pos, SrcStr) ->
    slurp_strmatch_token(qcon, gs_strmatch:smr_sf_qcon(), Pos, SrcStr);
slurp_token_of_shape(tvar, Pos, SrcStr) ->
    slurp_strmatch_token(tvar, gs_strmatch:smr_sf_tvar(), Pos, SrcStr);
% LITERALS: int16, int10, ak, ct, sg, char, string, bytes
slurp_token_of_shape(int16, Pos, SrcStr) ->
    slurp_strmatch_token(int16, gs_strmatch:smr_sf_int16(), Pos, SrcStr);
slurp_token_of_shape(int10, Pos, SrcStr) ->
    slurp_strmatch_token(int10, gs_strmatch:smr_sf_int10(), Pos, SrcStr);
slurp_token_of_shape(ak, Pos, SrcStr) ->
    slurp_strmatch_token(ak, gs_strmatch:smr_sf_ak(), Pos, SrcStr);
slurp_token_of_shape(ct, Pos, SrcStr) ->
    slurp_strmatch_token(ct, gs_strmatch:smr_sf_ct(), Pos, SrcStr);
slurp_token_of_shape(sg, Pos, SrcStr) ->
    slurp_strmatch_token(sg, gs_strmatch:smr_sf_sg(), Pos, SrcStr);
slurp_token_of_shape(char, Pos, SrcStr) ->
    slurp_strmatch_token(char, gs_strmatch:smr_sf_char(), Pos, SrcStr);
slurp_token_of_shape(string, Pos, SrcStr) ->
    slurp_strmatch_token(string, gs_strmatch:smr_sf_str(), Pos, SrcStr);
slurp_token_of_shape(bytes, Pos, SrcStr) ->
    slurp_strmatch_token(bytes, gs_strmatch:smr_sf_bytes(), Pos, SrcStr);
% anything else is a shape nobody has written a lexer for yet
slurp_token_of_shape(NyiType, Pos, SrcStr) ->
    Message = io_lib:format("cannot slurp token of shape: ~p", [NyiType]),
    error(#gsc_err{atom  = nyi,
                   str   = Message,
                   extra = [{token_shape, NyiType},
                            {pos, Pos},
                            {rest, SrcStr}]}).

% slurp_strmatch_token(Shape, Matcher, Pos, SrcStr)
%
% Shared tail of every shape that is lexed by a single gs_strmatch matcher:
% run Matcher against the front of SrcStr and wrap a successful match in a
% token of the given Shape at Pos.
slurp_strmatch_token(Shape, Matcher, Pos, SrcStr) ->
    case gs_strmatch:match(Matcher, SrcStr) of
        {strmatch, Str, Rest} ->
            {tokmatch, #tk{shape = Shape, pos = Pos, str = Str}, Rest};
        no_strmatch ->
            no_tokmatch
    end.
% takeline(Acc, Str) -> {Line, Rest}
%
% Split Str in front of its first newline. Acc holds already-taken
% characters in REVERSE order; they end up, in forward order, in front of
% the characters taken from Str. Rest still starts with the newline, or is
% "" when Str has no newline.
takeline(Acc, Str) ->
    {UpToNewline, Rest} = lists:splitwith(fun(C) -> C =/= $\n end, Str),
    {lists:reverse(Acc, UpToNewline), Rest}.
% bcom(Comment, Depth, Src) -> {ok, CommentStr, Rest}
%                            | {ierr, unterminated_block_comment}
%
% Scan the remainder of a (possibly nested) block comment.
%
%   Comment - chardata of the comment text collected so far
%   Depth   - number of "/*" openers that are still unclosed
%   Src     - input that has not been consumed yet
%
% Each "/*" raises the depth and each "*/" lowers it; the comment is complete
% once the depth is back at 0.
%
% done: every opener has been closed
bcom(Comment, 0, Rest) ->
    {ok, unicode:characters_to_nfc_list(Comment), Rest};
% premature end of input
bcom(_Comment, Depth, "") when Depth > 0 ->
    {ierr, unterminated_block_comment};
% closer: decrease depth
bcom(Comment, Depth, "*/" ++ Tail) when Depth > 0 ->
    bcom([Comment, "*/"], Depth - 1, Tail);
% nested opener: increase depth
bcom(Comment, Depth, "/*" ++ Tail) when Depth > 0 ->
    bcom([Comment, "/*"], Depth + 1, Tail);
% any other character: same depth, append to the comment text
bcom(Comment, Depth, [Char | Tail]) when Depth > 0 ->
    bcom([Comment, Char], Depth, Tail).
%------------------------------------------
% INTERNAL UTILITIES
%------------------------------------------
-spec take_while(Pred, List) -> {Taken, Rest}
    when Pred  :: fun((Item) -> boolean()),
         List  :: [Item],
         Taken :: List,
         Rest  :: List.
% @doc
% Split List into the longest leading run of items that satisfy Pred and
% everything after it. Similar to lists:takewhile/2, but the untaken tail is
% returned as well; the name is a reminder that two things come back.
%
% This is exactly lists:splitwith/2, so it delegates to it.
take_while(Pred, List) ->
    lists:splitwith(Pred, List).
-spec take_while(Pred, Prefix, List) -> {Taken, Rest}
    when Pred   :: fun((Item) -> boolean()),
         Prefix :: List,
         List   :: [Item],
         Taken  :: List,
         Rest   :: List.
% @doc
% Like take_while/2, but the result is `{Prefix ++ Taken, Rest}'.
%
% Prefix is a list of items that have already been taken (in forward order).
% Pred is never applied to the items of Prefix, only to the items of List.
take_while(Pred, Pfx, List) ->
    % lists:splitwith/2 is the stdlib version of "take while, and also hand
    % back the rest"
    {Taken, Rest} = lists:splitwith(Pred, List),
    {Pfx ++ Taken, Rest}.