more mass renaming

2026-06-02 01:48:05 -07:00
parent eff77fff6b
commit 270f192f0c
53 changed files with 1264 additions and 431 deletions
@@ -0,0 +1,802 @@
+% @doc
+% Ref: so_scan.erl
+%
+% This file contains a sophia tokenizer written in straightforward erlang with data
+% types that are sane.
+%
+% For MVP it mimics the behavior of so_scan exactly, in terms of like what its
+% definition of a token is and so on.
+%
+% gsc_so_scan.erl contains a compatibility layer that should agree with so_scan
+% exactly. It converts the data types here to the shapes that so_scan outputs.
+%
+% This is for two reasons:
+%
+% 1. in order to enable testing the two modules against each other, and
+% 2. to future-proof in case we decide to incrementally incorporate the gsc
+%    code into the legacy sophia compiler
+% @end
+-module(gs_tokens).
+
+% meta
+-export([
+    token_shapes_parse_order/0,
+    kwds/0
+]).
+
+-export([
+    take_while/2,
+    take_while/3,
+    take_block/1,
+    take_block_item/1,
+    strings/2,
+    slurp_plist/1
+]).
+
+% token slurping
+-export([
+    indent_level/1,
+    is_significant/1,
+    filter_significant/1,
+    significant_tokens/1,
+    tokens_from_iolist/1,
+    tokens/1,
+    slurp_token/2,
+    slurp_token_shapes/3,
+    slurp_token_of_shape/3,
+    new_pos/2
+]).
+
+-include("$gsc_include/gsc.hrl").
+
+
+%=======================================================
+% API: functions
+%=======================================================
+
+-spec strings(N, Tokens) -> AtMostNStrings
+    when N              :: non_neg_integer(),
+         Tokens         :: [tk()],
+         AtMostNStrings :: [string()].
+% @doc
+% Return the `str' field of each of the first N tokens, in order. When the
+% token list runs out before N tokens have been seen, the strings of all
+% available tokens are returned.
+
+% nothing more was asked for
+strings(0, _) ->
+    [];
+% nothing more is available
+strings(_, []) ->
+    [];
+strings(Count, [#tk{str = Str} | Tokens]) when is_integer(Count), Count >= 1 ->
+    [Str | strings(Count - 1, Tokens)].
+
+
+% used by parser
+%
+% a block is a column-delimited list of block items
+%
+% BLOCK =
+%   foo
+%       ...
+%   bar
+%       ...
+%   baz
+%       ...
+%
+% BLOCK_ITEM =
+%   foo
+%       ...
+
+-spec take_block(Tokens) -> {BlockTokens, Rest}
+    when Tokens :: [tk()],
+         BlockTokens :: Tokens,
+         Rest :: Tokens.
+% @doc
+% Split off the block that starts at the head token: the head token itself
+% plus the longest run of following tokens whose column is >= the head
+% token's column. `Rest' begins at the first token that sits to the left
+% of the head token's column. An empty input yields two empty lists.
+
+take_block([]) ->
+    {[], []};
+take_block([First = #tk{pos = {_Line, MinCol}} | Others]) ->
+    % a token stays in the block as long as it is not left of the block column
+    InBlock = fun(#tk{pos = {_, Col}}) -> Col >= MinCol end,
+    take_while(InBlock, [First], Others).
+
+
+
+-spec take_block_item(Tokens) -> {ItemTokens, Rest}
+    when Tokens :: [tk()],
+         ItemTokens :: Tokens,
+         Rest :: Tokens.
+% @doc
+% Split off the block item that starts at the head token: the head token
+% itself plus the longest run of following tokens whose column is strictly
+% greater than the head token's column. `Rest' begins at the first token
+% that is at or to the left of the head token's column. An empty input
+% yields two empty lists.
+
+take_block_item([]) ->
+    {[], []};
+take_block_item([First = #tk{pos = {_Line, HeadCol}} | Others]) ->
+    % only tokens indented deeper than the head belong to the item
+    InItem = fun(#tk{pos = {_, Col}}) -> Col > HeadCol end,
+    take_while(InItem, [First], Others).
+
+
+
+-spec slurp_plist(Tokens) -> Result
+    when Tokens    :: [Token],
+         Result    :: {slurp, PList :: Tokens, After :: Tokens}
+                    | {error, Mismatch},
+         Mismatch  :: {fixme, mismatch, OpenStack, ClosedBy},
+         OpenStack :: Tokens,
+         ClosedBy  :: none | {value, Token},
+         Token     :: tk().
+
+% @doc
+% Slurp a parenthesized list off the front of the token list.
+%
+% The wording is `slurp' rather than `take' because delimiter matching is
+% enforced. The examples below are schematic: they show source text, while
+% the real arguments and results are lists of tokens.
+%
+% typical happy path:
+%   "(foo, bar) baz" ~> {slurp, "(foo, bar)", "baz"}
+%   "() baz"         ~> {slurp, "()",         "baz"}
+%   "foo () baz"     ~> {slurp, "",           "foo () baz"}
+%
+% typical sad path:
+%   "(foo, bar]"  ~> {error, {fixme, mismatch, ["("],      {value, "]"}}}
+%   "(foo, bar"   ~> {error, {fixme, mismatch, ["("],      none}}
+%   "([foo, bar)" ~> {error, {fixme, mismatch, ["[", "("], {value, ")"}}}
+%
+% counterintuitive:
+%   "[foo, bar) baz"     ~> {slurp, "", "[foo, bar) baz"}
+%   "~!!\inv4l1d syntax" ~> {slurp, "", "~!!\inv4l1d syntax"}
+%   "(foo, bar)(baz)"    ~> {slurp, "(foo, bar)", "(baz)"}
+%
+% Anything that does not start with "(" slurps nothing and is handed back
+% untouched. The only "syntax checking" performed is that the delimiter
+% stack pushes and pops properly.
+%
+% On mismatch the open delimiters are reported in STACK order, i.e. the
+% most recently opened delimiter comes first. That is convenient for
+% programs but may surprise a reader who expects source order.
+
+slurp_plist([#tk{str = "("} = Open | Rest]) ->
+    % the opening paren is both the first slurped token and the initial stack
+    slurp_dlist([Open], [Open], Rest);
+slurp_plist(Tokens) ->
+    {slurp, [], Tokens}.
+
+
+% slurp_dlist(All, Opens, Tokens)
+%
+% Worker for slurp_plist/1.
+%
+%   All    - tokens slurped so far, most recent first
+%   Opens  - stack of currently open delimiter tokens, most recent first
+%   Tokens - tokens not yet examined
+%
+% Returns {slurp, Slurped, Rest} once the stack is empty, or
+% {error, {fixme, mismatch, Opens, ClosedBy}} when the input ends with
+% delimiters still open (ClosedBy = none) or a closing delimiter does not
+% match the top of the stack (ClosedBy = {value, Token}).
+
+% happy terminal case: stack popped entirely
+slurp_dlist(All, [], Rest) ->
+    {slurp, lists:reverse(All), Rest};
+% from here on the stack is nonempty
+% sad: ran out of tokens while delimiters are still open
+slurp_dlist(_All, Opens, []) ->
+    {error, {fixme, mismatch, Opens, none}};
+% happy: a closing delimiter pops the matching open delimiter
+slurp_dlist(All, [#tk{str = "("}      | Opens],
+                 [#tk{str = ")"} = Tk | Rest]) ->
+    slurp_dlist([Tk | All], Opens, Rest);
+slurp_dlist(All, [#tk{str = "["}      | Opens],
+                 [#tk{str = "]"} = Tk | Rest]) ->
+    slurp_dlist([Tk | All], Opens, Rest);
+slurp_dlist(All, [#tk{str = "{"}      | Opens],
+                 [#tk{str = "}"} = Tk | Rest]) ->
+    slurp_dlist([Tk | All], Opens, Rest);
+% happy: an open delimiter gets pushed
+slurp_dlist(All, Opens, [#tk{str = Str} = Tk | Rest])
+        when Str =:= "("; Str =:= "["; Str =:= "{" ->
+    slurp_dlist([Tk | All], [Tk | Opens], Rest);
+% sad: a closing delimiter that did not match the top of the stack above
+slurp_dlist(_All, Opens, [#tk{str = Str} = BadClose | _])
+        when Str =:= ")"; Str =:= "]"; Str =:= "}" ->
+    {error, {fixme, mismatch, Opens, {value, BadClose}}};
+% general case: any other token is simply slurped
+slurp_dlist(All, Opens, [Tk | Rest]) ->
+    slurp_dlist([Tk | All], Opens, Rest).
+
+
+%-------------------------------------------------------
+% API: meta info
+%
+% This is parse order definition, list of keywords, etc
+%
+% -export([
+%     token_shapes_parse_order/0,
+%     kwds/0
+% ]).
+%-------------------------------------------------------
+
+-spec token_shapes_parse_order() -> [tk_shape()].
+% @doc
+% The sophia token shapes in the order in which they are tried: as soon as
+% one shape matches, the shapes after it are not attempted.
+%
+% For reference, the rule table this ordering was modelled on (so_scan):
+%
+%    Rules =
+%          %% Comments and whitespace
+%        [ CommentStart
+%        , {"//.*", skip()}
+%        , {WS,     skip()}
+%
+%          %% Special characters
+%        , {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
+%
+%          %% Literals
+%        , {CHAR,   token(char,   fun parse_char/1)}
+%        , {STRING, token(string, fun parse_string/1)}
+%        , {HEX,    token(hex,    fun parse_hex/1)}
+%        , {INT,    token(int,    fun parse_int/1)}
+%        , {BYTES,  token(bytes,  fun parse_bytes/1)}
+%
+%          %% Identifiers (qualified first!)
+%        , {QID,   token(qid,  fun(S) -> string:tokens(S, ".") end)}
+%        , {QCON,  token(qcon, fun(S) -> string:tokens(S, ".") end)}
+%        , {TVAR,  token(tvar)}
+%        , override({ID, token(id)}, {KW, symbol()})    %% Keywords override identifiers. Need to
+%        , {CON, token(con)}                            %% use override to avoid lexing "lettuce"
+%                                                       %% as ['let', {id, "tuce"}].
+%          %% Operators
+%        , {OP, symbol()}
+%        ],
+% @end
+
+token_shapes_parse_order() ->
+    % one named group per concern so that the order stays easy to edit
+    Trivia    = [lcom, bcom, ws],
+    Punct     = [punct],
+    Literals  = [char, string, int16, int10, bytes,
+                 ak, ct, sg],
+    % qualified names must be tried before unqualified ones
+    Qualified = [qid, qcon],
+    TypeVars  = [tvar],
+    % keywords must be tried before ids
+    Words     = [kwd, id, con],
+    Operators = [op],
+    Trivia ++ Punct ++ Literals ++ Qualified ++ TypeVars ++ Words ++ Operators.
+
+
+-spec kwds() -> list(string()).
+% @doc
+% The sophia keywords, as strings. Every entry is compared against the
+% text of an id-shaped token to decide whether that token is a keyword.
+
+kwds() ->
+    [
+        "contract", "include", "let", "switch",
+        "type", "record", "datatype",
+        "if", "elif", "else",
+        "function", "stateful", "payable",
+        "true", "false",
+        "mod",
+        "public", "entrypoint", "private", "indexed",
+        "namespace", "interface", "main",
+        "using", "as", "for", "hiding",
+        "band", "bor", "bxor", "bnot"
+    ].
+
+
+%-------------------------------------------------------
+% API: token slurping
+%
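+% -export([
+%     indent_level/1,
+%     is_significant/1,
+%     filter_significant/1,
+%     significant_tokens/1,
+%     tokens_from_iolist/1,
+%     tokens/1,
+%     slurp_token/2,
+%     slurp_token_shapes/3,
+%     slurp_token_of_shape/3,
+%     new_pos/2
+% ]).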
+%-------------------------------------------------------
+
+% Token accessors
+-spec indent_level(tk()) -> pos_integer().
+% @doc
+% The indent level of a token is the column component of its position.
+
+indent_level(#tk{pos = {_Line, Column}}) ->
+    Column.
+
+
+
+-spec significant_tokens(SrcStr) -> Result
+    when SrcStr :: iolist(),
+         Result :: {ok, Tokens}
+                 | {error, gsc_err()},
+         Tokens :: [tk()].
+% @doc
+% Tokenize the source with tokens/1 and keep only the significant tokens
+% (see is_significant/1). Anything other than `{ok, Tokens}' coming back
+% from tokens/1 is returned unchanged.
+
+significant_tokens(SrcStr) ->
+    keep_significant(tokens(SrcStr)).
+
+% filter a successful tokenization, pass everything else through
+keep_significant({ok, AllTokens}) ->
+    {ok, filter_significant(AllTokens)};
+keep_significant(Failure) ->
+    Failure.
+
+
+
+-spec filter_significant(Tokens) -> SignificantTokens
+    when Tokens            :: [tk()],
+         SignificantTokens :: Tokens.
+% @doc
+% Drop every token for which is_significant/1 is false, keeping the order
+% of the remaining tokens.
+
+filter_significant(Tokens) ->
+    [Tk || Tk <- Tokens, is_significant(Tk)].
+
+
+
+-spec is_significant(Token) -> boolean()
+    when Token :: tk().
+% @doc
+% A token is insignificant when it is a block comment, a line comment or
+% whitespace; everything else is significant.
+
+is_significant(#tk{shape = Shape})
+        when Shape =:= bcom; Shape =:= lcom; Shape =:= ws ->
+    false;
+is_significant(_) ->
+    true.
+
+
+-spec tokens_from_iolist(SrcStr) -> Result when
+        SrcStr  :: iolist(),
+        Result  :: {ok, Tokens}
+                 | {error, gsc_err()},
+         Tokens :: [tk()].
+
+% @doc
+% Alias for tokens/1: the argument is handed over unchanged and the result
+% is returned unchanged.
+tokens_from_iolist(SrcStr) ->
+    tokens(SrcStr).
+
+
+
+-spec tokens(SrcStr) -> Result
+    when SrcStr :: iolist(),
+         Result :: {ok, Tokens}
+                 | {error, gsc_err()},
+         Tokens :: [tk()].
+% @doc
+% Tokenize an entire source text.
+%
+% The input is first normalized to a flat NFC codepoint list; tokens are
+% then slurped off the front one at a time, starting at position {1, 1},
+% until the text is exhausted.
+%
+% On success the result is `{ok, Tokens}' in source order. Comment and
+% whitespace tokens are included; significant_tokens/1 drops them.
+%
+% On failure the result is `{error, Err}':
+%
+%   - no token shape matches at some position (semantically: an illegal
+%     character): `Err' records the tokens read so far, the position of the
+%     break and the unconsumed remainder of the text;
+%   - a block comment is never closed: `Err' records the same three things;
+%   - the input is not valid unicode character data.
+
+tokens(S) ->
+    % normalization doubles as validation: chardata that is not valid
+    % unicode comes back as an error tuple, which must not be handed to the
+    % token matchers as if it were a string
+    case unicode:characters_to_nfc_list(S) of
+        {error, Valid, Invalid} ->
+            % NOTE(review): assumes a plain #gsc_err{} is an acceptable
+            % gsc_err() here -- confirm against gsc.hrl
+            Err = #gsc_err{atom  = invalid_unicode,
+                           str   = "source is not valid unicode character data",
+                           extra = [{valid_prefix, Valid},
+                                    {rest, Invalid}]},
+            {error, Err};
+        SrcStr when is_list(SrcStr) ->
+            tokens([], {1, 1}, SrcStr)
+    end.
+
+% tokens(Stack, Pos, SrcStr)
+%
+%   Stack  - tokens read so far, most recent first
+%   Pos    - {Line, Column} at which the next token starts
+%   SrcStr - text not yet consumed
+tokens(Stack, _FinalPos, "") ->
+    {ok, lists:reverse(Stack)};
+tokens(Stack, Pos, SrcStr) ->
+    case slurp_token(Pos, SrcStr) of
+        {tokmatch, NewToken = #tk{str = TokStr},
+                   NewSrcStr} ->
+            % the next token starts where this token's text ends
+            NewPos = new_pos(Pos, TokStr),
+            tokens([NewToken | Stack], NewPos, NewSrcStr);
+        no_tokmatch ->
+            PrevTokens = lists:reverse(Stack),
+            Err = #gsc_err_no_tokmatch{prev_tokens = PrevTokens,
+                                       break_pos   = Pos,
+                                       rest        = SrcStr},
+            {error, Err};
+        % NOTE(review): the comment that used to sit here said that so_scan
+        % tolerates an unterminated block comment at the end of a file and
+        % that this module would agree with it for now. The code reports an
+        % error instead -- confirm which behavior is intended.
+        {ierr, unterminated_block_comment} ->
+            PrevTokens = lists:reverse(Stack),
+            Err = #gsc_err_bcom_unterminated{prev_tokens = PrevTokens,
+                                             break_pos   = Pos,
+                                             rest        = SrcStr},
+            {error, Err};
+        Error = {error, _} ->
+            Error
+    end.
+
+% new_pos(OldPos, TokStr)
+%
+% Compute the {Line, Column} position that follows `TokStr' when `TokStr'
+% starts at `OldPos'.
+%
+% The column arithmetic is meant to be compatible with so_scan, which makes
+% it slightly unusual: columns are counted per byte of the UTF-8 encoding,
+% not per codepoint. A character whose UTF-8 encoding is the three bytes
+% <<A,B,C>> therefore advances the column by 3, even though it is a single
+% element of the codepoint list.
+%
+% Two bytes are special:
+%
+%   - newline moves to column 1 of the next line;
+%   - tab moves to the next tab stop. Tab stops are 8 columns apart and
+%     columns are 1-based, so the stops are at columns
+%       1 9 17 25 33 ...
+%     (in 0-based terms: 0 8 16 24 32 ...)
+%
+% In the tokenizer context the token string is a flat list of codepoints
+% (the output of NFC normalization), so it is converted to its UTF-8 bytes
+% first and the position is then advanced byte by byte.
+new_pos(OldPos, TokStr) ->
+    Utf8 = unicode:characters_to_binary(TokStr),
+    new_pos_bytes(Utf8, OldPos).
+
+% advance the position over every byte of the binary
+new_pos_bytes(<<>>, Pos) ->
+    Pos;
+new_pos_bytes(<<Byte, Tail/binary>>, {Line, Col}) ->
+    Next =
+        case Byte of
+            % newline: first column of the following line
+            $\n -> {Line + 1, 1};
+            % tab: do the tab-stop arithmetic 0-based, then shift back to
+            % the 1-based column
+            $\t -> {Line, next_tabstop8(Col - 1) + 1};
+            % any other byte occupies exactly one column
+            _   -> {Line, Col + 1}
+        end,
+    new_pos_bytes(Tail, Next).
+
+% 0-based column of the first tab stop strictly after the 0-based column
+% `Col0'; tab stops are at the multiples of 8
+%
+% 0     8       16      24      etc
+% 0*8   1*8     2*8     3*8     etc
+next_tabstop8(Col0) when Col0 >= 0 ->
+    (Col0 div 8 + 1) * 8.
+
+%% copied from so_scan_lib.erl just to match behavior
+%-define(TAB_SIZE, 8).
+%
+%next_pos([], P) -> P;
+%next_pos([$\n | S], {L, _}) -> next_pos(S, {L + 1, 1});
+%next_pos([$\t | S], {L, C}) -> next_pos(S, {L, (C + ?TAB_SIZE - 1) div ?TAB_SIZE * ?TAB_SIZE + 1});
+%next_pos([_   | S], {L, C}) -> next_pos(S, {L, C + 1}).
+
+
+
+-spec slurp_token(Pos, SrcStr) -> Result
+    when Pos    :: tk_pos(),
+         SrcStr :: string(),
+         Result :: {tokmatch, Token, Rest}
+                 | no_tokmatch
+                 | {error, gsc_err()}
+                 | {ierr, unterminated_block_comment},
+         Token  :: tk(),
+         Rest   :: string().
+% @doc
+% Grab a single token off the front of the string, trying the token shapes
+% in the order given by `token_shapes_parse_order/0'. `Pos' is the position
+% recorded in the resulting token.
+
+slurp_token(Pos, SrcStr) ->
+    % the order lives in its own function so that it is easy to adjust
+    ParseOrder = token_shapes_parse_order(),
+    slurp_token_shapes(ParseOrder, Pos, SrcStr).
+
+
+
+-spec slurp_token_shapes(ParseOrder, Pos, SrcStr) -> Result
+    when ParseOrder :: [tk_shape()],
+         Pos        :: tk_pos(),
+         SrcStr     :: string(),
+         Result     :: {tokmatch, Token, Rest}
+                     | no_tokmatch
+                     | {error, gsc_err()}
+                     | {ierr, unterminated_block_comment},
+         Token      :: tk(),
+         Rest       :: string().
+% @doc
+% Grab a single token off the front of the string, trying the shapes of
+% `ParseOrder' from left to right. The first shape that yields anything
+% other than `no_tokmatch' decides the result; `no_tokmatch' is returned
+% only when every shape in the list has been tried without a match.
+
+slurp_token_shapes([], _Pos, _SrcStr) ->
+    no_tokmatch;
+slurp_token_shapes([Shape | LaterShapes], Pos, SrcStr) ->
+    case slurp_token_of_shape(Shape, Pos, SrcStr) of
+        % this shape does not fit: move on to the next one
+        no_tokmatch ->
+            slurp_token_shapes(LaterShapes, Pos, SrcStr);
+        % everything else ends the search
+        {tokmatch, _, _} = Hit ->
+            Hit;
+        {ierr, _} = Internal ->
+            Internal;
+        {error, _} = Failure ->
+            Failure
+    end.
+
+
+-spec slurp_token_of_shape(TokenType, Pos, SrcStr) -> MaybeToken
+    when TokenType  :: tk_shape(),
+         Pos        :: tk_pos(),
+         SrcStr     :: string(),
+         MaybeToken :: {tokmatch, Token, Rest}
+                     | no_tokmatch
+                     | {error, gsc_err()}
+                     | {ierr, unterminated_block_comment},
+         Token      :: tk(),
+         Rest       :: string().
+% @doc
+% Match a sophia token of a given shape off the front of the string.
+%
+% Returns `{tokmatch, Token, Rest}' when the front of `SrcStr' has the
+% requested shape; `Token' records the shape, `Pos' and the matched text.
+% Returns `no_tokmatch' when it does not. For the `bcom' shape, whatever
+% the block comment scanner reports for a comment that is never closed is
+% passed through unchanged.
+%
+% Asking for a shape that has no clause below raises an error whose reason
+% is a `#gsc_err{atom = nyi}'.
+% @end
+
+% COMMENTS AND WHITESPACE: lcom, bcom, ws
+%
+% lcom: sophia line comment; runs from "//" up to, but not including, the
+% next newline
+%
+% FIXME: make a string matcher for line comments
+slurp_token_of_shape(lcom, Pos, SrcStr) ->
+    case SrcStr of
+        "//" ++ _ ->
+            {Line, Rest} = takeline("", SrcStr),
+            Token = #tk{shape = lcom, pos = Pos, str = Line},
+            {tokmatch, Token, Rest};
+        _ ->
+            no_tokmatch
+    end;
+% bcom: block comments cannot use a string matcher because nesting requires
+% keeping track of the comment depth
+slurp_token_of_shape(bcom, Pos, SrcStr0) ->
+    case SrcStr0 of
+        "/*" ++ SrcStr1 ->
+            % the opening "/*" is consumed here, so scanning starts at depth 1
+            case bcom("/*", 1, SrcStr1) of
+                {ok, CommentStr, SrcStr2} ->
+                    Token = #tk{shape = bcom, pos = Pos, str = CommentStr},
+                    {tokmatch, Token, SrcStr2};
+                Error ->
+                    Error
+            end;
+        _ ->
+            no_tokmatch
+    end;
+slurp_token_of_shape(ws, Pos, SrcStr) ->
+    slurp_strmatch(ws, gs_strmatch:smr_sf_ws(), Pos, SrcStr);
+% KEYWORDS, OPERATORS, PUNCTUATION: kwd, op, punct
+%
+% kwds are allowed to be prefixes of user-defined variable names; e.g.
+% "lettuce" should be parsed as an id, not as ["let", "tuce"]; for this
+% reason kwds must not be matched greedily on their own
+%
+% all the kwds are valid ids, so we match as an id and then check whether
+% the matched text is one of the kwds
+slurp_token_of_shape(kwd, Pos, SrcStr) ->
+    case slurp_token_of_shape(id, Pos, SrcStr) of
+        {tokmatch, IdTok = #tk{str = IdStr}, Rest} ->
+            case lists:member(IdStr, kwds()) of
+                false ->
+                    no_tokmatch;
+                true ->
+                    KwTok = IdTok#tk{shape = kwd},
+                    {tokmatch, KwTok, Rest}
+            end;
+        no_tokmatch ->
+            no_tokmatch
+    end;
+slurp_token_of_shape(op, Pos, SrcStr) ->
+    slurp_strmatch(op, gs_strmatch:smr_sf_op(), Pos, SrcStr);
+slurp_token_of_shape(punct, Pos, SrcStr) ->
+    slurp_strmatch(punct, gs_strmatch:smr_sf_punct(), Pos, SrcStr);
+% SOPHIA VARIABLE NAMES: id, con, qid, qcon, tvar
+slurp_token_of_shape(id, Pos, SrcStr) ->
+    slurp_strmatch(id, gs_strmatch:smr_sf_id(), Pos, SrcStr);
+slurp_token_of_shape(con, Pos, SrcStr) ->
+    slurp_strmatch(con, gs_strmatch:smr_sf_con(), Pos, SrcStr);
+slurp_token_of_shape(qid, Pos, SrcStr) ->
+    slurp_strmatch(qid, gs_strmatch:smr_sf_qid(), Pos, SrcStr);
+slurp_token_of_shape(qcon, Pos, SrcStr) ->
+    slurp_strmatch(qcon, gs_strmatch:smr_sf_qcon(), Pos, SrcStr);
+slurp_token_of_shape(tvar, Pos, SrcStr) ->
+    slurp_strmatch(tvar, gs_strmatch:smr_sf_tvar(), Pos, SrcStr);
+% LITERALS: int16, int10, ak, ct, sg, char, string, bytes
+slurp_token_of_shape(int16, Pos, SrcStr) ->
+    slurp_strmatch(int16, gs_strmatch:smr_sf_int16(), Pos, SrcStr);
+slurp_token_of_shape(int10, Pos, SrcStr) ->
+    slurp_strmatch(int10, gs_strmatch:smr_sf_int10(), Pos, SrcStr);
+slurp_token_of_shape(ak, Pos, SrcStr) ->
+    slurp_strmatch(ak, gs_strmatch:smr_sf_ak(), Pos, SrcStr);
+slurp_token_of_shape(ct, Pos, SrcStr) ->
+    slurp_strmatch(ct, gs_strmatch:smr_sf_ct(), Pos, SrcStr);
+slurp_token_of_shape(sg, Pos, SrcStr) ->
+    slurp_strmatch(sg, gs_strmatch:smr_sf_sg(), Pos, SrcStr);
+slurp_token_of_shape(char, Pos, SrcStr) ->
+    slurp_strmatch(char, gs_strmatch:smr_sf_char(), Pos, SrcStr);
+slurp_token_of_shape(string, Pos, SrcStr) ->
+    slurp_strmatch(string, gs_strmatch:smr_sf_str(), Pos, SrcStr);
+slurp_token_of_shape(bytes, Pos, SrcStr) ->
+    slurp_strmatch(bytes, gs_strmatch:smr_sf_bytes(), Pos, SrcStr);
+% any other shape is not implemented
+slurp_token_of_shape(NyiType, Pos, SrcStr) ->
+    Message = io_lib:format("cannot slurp token of shape: ~p", [NyiType]),
+    error(#gsc_err{atom   = nyi,
+                   str = Message,
+                   extra  = [{token_shape, NyiType},
+                             {pos, Pos},
+                             {rest, SrcStr}]}).
+
+
+% Shared body of every shape that is recognized by a gs_strmatch string
+% matcher: run the matcher on the front of the string and, on a match, wrap
+% the matched text in a token of the given shape at the given position.
+slurp_strmatch(Shape, Matcher, Pos, SrcStr) ->
+    case gs_strmatch:match(Matcher, SrcStr) of
+        {strmatch, Str, Rest} ->
+            Token = #tk{shape = Shape, pos = Pos, str = Str},
+            {tokmatch, Token, Rest};
+        no_strmatch ->
+            no_tokmatch
+    end.
+
+
+
+% takeline(Acc, Str)
+%
+% Split `Str' in front of its first newline. Returns {Line, Rest}: `Line'
+% is `Acc' (a reversed prefix) put back in order, followed by everything
+% before the newline; `Rest' starts at the newline, or is empty when `Str'
+% contains no newline.
+takeline(Acc, Str) ->
+    NotNewline = fun(Char) -> Char =/= $\n end,
+    {BeforeNewline, Rest} = lists:splitwith(NotNewline, Str),
+    {lists:reverse(Acc, BeforeNewline), Rest}.
+
+
+% bcom(CommentStr, Depth, SrcStr)
+%
+% Scan the remainder of a (possibly nested) block comment.
+%
+%   CommentStr - comment text consumed before the scan starts
+%   Depth      - number of "/*" that are currently open
+%   SrcStr     - text that follows `CommentStr'
+%
+% Every "/*" increases the depth and every "*/" decreases it. Returns
+% {ok, Comment, Rest} as soon as the depth reaches 0, where `Comment' is
+% `CommentStr' plus everything consumed, as an NFC codepoint list. Returns
+% {ierr, unterminated_block_comment} when the text ends first.
+bcom(CommentStr, Depth, SrcStr) when Depth > 0 ->
+    bcom_scan(SrcStr, Depth, [], CommentStr);
+bcom(CommentStr, 0, SrcStr) ->
+    bcom_scan(SrcStr, 0, [], CommentStr).
+
+% bcom_scan(SrcStr, Depth, RevSeen, Prefix)
+%
+% `RevSeen' holds the characters consumed so far, most recent first
+%
+% all comments closed: done
+bcom_scan(Rest, 0, RevSeen, Prefix) ->
+    Comment = unicode:characters_to_nfc_list([Prefix, lists:reverse(RevSeen)]),
+    {ok, Comment, Rest};
+% premature end
+bcom_scan("", _Depth, _RevSeen, _Prefix) ->
+    {ierr, unterminated_block_comment};
+% decrease depth
+bcom_scan("*/" ++ Rest, Depth, RevSeen, Prefix) ->
+    bcom_scan(Rest, Depth - 1, [$/, $* | RevSeen], Prefix);
+% increase depth
+bcom_scan("/*" ++ Rest, Depth, RevSeen, Prefix) ->
+    bcom_scan(Rest, Depth + 1, [$*, $/ | RevSeen], Prefix);
+% same depth, keep the character
+bcom_scan([Char | Rest], Depth, RevSeen, Prefix) ->
+    bcom_scan(Rest, Depth, [Char | RevSeen], Prefix).
+
+
+%------------------------------------------
+% LIST UTILITIES (take_while/2 and take_while/3 are exported)
+%------------------------------------------
+
+-spec take_while(Pred, List) -> {Taken, Rest}
+    when Pred  :: fun((Item) -> boolean()),
+         List  :: [Item],
+         Taken :: List,
+         Rest  :: List.
+% @doc
+% Similar to lists:takewhile/2, but returns {Taken, Rest}: `Taken' is the
+% longest prefix of `List' whose items all satisfy `Pred', `Rest' is the
+% remainder of `List'. The name is a reminder that two things come back.
+
+take_while(Pred, List) ->
+    take_while(Pred, [], List).
+
+
+-spec take_while(Pred, Prefix, List) -> {Taken, Rest}
+    when Pred  :: fun((Item) -> boolean()),
+         Prefix :: List,
+         List   :: [Item],
+         Taken  :: List,
+         Rest   :: List.
+% @doc
+% Similar to take_while/2, but returns {Prefix ++ Taken, Rest}.
+%
+% `Prefix' holds items that are already known to belong to the result; it
+% is put in front of the taken items unchanged and `Pred' is never applied
+% to it.
+take_while(Pred, Pfx, List) ->
+    {Taken, Rest} = lists:splitwith(Pred, List),
+    {Pfx ++ Taken, Rest}.