gsc/src/gs_tokens.erl

% @doc
% Ref: so_scan.erl
%
% This file contains a sophia tokenizer written in straightforward erlang with data
% types that are sane.
%
% For MVP it mimics the behavior of so_scan exactly, in terms of like what its
% definition of a token is and so on.
%
% gsc_so_scan.erl contains a compatibility layer that should agree with so_scan
% exactly. It converts the data types here to the shapes that so_scan outputs.
%
% This is for two reasons:
%
% 1. in order to enable testing the two modules against each other, and
% 2. to future-proof in case we decide to incrementally incorporate the gsc
%    code into the legacy sophia compiler
% @end
-module(gs_tokens).

% meta
-export([
    token_shapes_parse_order/0,
    kwds/0
]).

-export([
    take_while/2,
    take_while/3,
    take_block/1,
    take_block_item/1,
    strings/2,
    slurp_plist/1
]).

% token slurping
-export([
    indent_level/1,
    is_significant/1,
    filter_significant/1,
    significant_tokens/1,
    tokens_from_iolist/1,
    tokens/1,
    slurp_token/2,
    slurp_token_shapes/3,
    slurp_token_of_shape/3,
    new_pos/2
]).

-include("$gsc_include/gsc.hrl").


%=======================================================
% API: functions
%=======================================================

-spec strings(N, Tokens) -> AtMostNStrings
    when N              :: non_neg_integer(),
         Tokens         :: [tk()],
         AtMostNStrings :: [string()].
% @doc
% Collect the `str' field of the leading tokens, in token order.  Collection
% stops after `N' tokens or when the token list runs out, whichever happens
% first, so the result holds at most `N' strings.

% nothing more was asked for
strings(0, _Tokens) ->
    [];
% ran out of tokens before reaching N
strings(_N, []) ->
    [];
strings(N, [#tk{str = Str} | Tail]) when is_integer(N), N >= 1 ->
    Remaining = N - 1,
    [Str | strings(Remaining, Tail)].


% used by parser
%
% a block is a column-delimited list of block items
%
% BLOCK =
%   foo
%       ...
%   bar
%       ...
%   baz
%       ...
%
% BLOCK_ITEM =
%   foo
%       ...

-spec take_block(Tokens) -> {BlockTokens, Rest}
    when Tokens :: [tk()],
         BlockTokens :: Tokens,
         Rest :: Tokens.
% @doc
% Split off the block that starts at the head token: the head token itself
% plus the longest run of following tokens whose column is >= the head's
% column.  `Rest' starts at the first token that sits to the left of the
% head's column.  An empty token list yields two empty lists.

take_block([]) ->
    {[], []};
take_block([First = #tk{pos = {_Line, Col0}} | Following]) ->
    % a following token stays in the block while it is not left of the head
    InBlock = fun(#tk{pos = {_, Col}}) -> Col >= Col0 end,
    take_while(InBlock, [First], Following).


-spec take_block_item(Tokens) -> {ItemTokens, Rest}
    when Tokens :: [tk()],
         ItemTokens :: Tokens,
         Rest :: Tokens.
% @doc
% Split off the block item that starts at the head token: the head token
% itself plus the longest run of following tokens whose column is strictly
% greater than the head's column.  `Rest' starts at the first token that is at
% or to the left of the head's column.  An empty token list yields two empty
% lists.

take_block_item([]) ->
    {[], []};
take_block_item([First = #tk{pos = {_Line, Col0}} | Following]) ->
    % a following token belongs to the item only while it is indented deeper
    InItem = fun(#tk{pos = {_, Col}}) -> Col > Col0 end,
    take_while(InItem, [First], Following).


-spec slurp_plist(Tokens) -> Result
    when Tokens    :: [Token],
         Result    :: {slurp, PList :: Tokens, After :: Tokens}
                    | {error, Mismatch},
         Mismatch  :: {fixme, mismatch, OpenStack, ClosedBy},
         OpenStack :: Tokens,
         ClosedBy  :: none | {value, Token},
         Token     :: tk().

% @doc
% Slurp a parenthesized list off the front of the token list.
%
% This is called `slurp' rather than `take' because delimiter matching is
% enforced: every "(", "[" and "{" inside the list has to be closed by its
% own counterpart.  That stack discipline is the only syntax checking done
% here.
%
% In the examples below a string stands for the token list it lexes to.
%
% happy path:
%   "(foo, bar) baz"  ~> {slurp, "(foo, bar)", "baz"}
%   "() baz"          ~> {slurp, "()",         "baz"}
%   "(foo, bar)(baz)" ~> {slurp, "(foo, bar)", "(baz)"}
%
% input that does not start with "(" is handed back untouched, whatever it
% contains:
%   "foo () baz"         ~> {slurp, "", "foo () baz"}
%   "[foo, bar) baz"     ~> {slurp, "", "[foo, bar) baz"}
%   "~!!\inv4l1d syntax" ~> {slurp, "", "~!!\inv4l1d syntax"}
%
% sad path:
%   "(foo, bar]"  ~> {error, {fixme, mismatch, ["("],      {value, "]"}}}
%   "(foo, bar"   ~> {error, {fixme, mismatch, ["("],      none}}
%   "([foo, bar)" ~> {error, {fixme, mismatch, ["[", "("], {value, ")"}}}
%
% On a mismatch the open delimiters are reported in stack order, i.e. the
% most recently opened delimiter comes first.

slurp_plist([Open = #tk{str = "("} | Following]) ->
    % the "(" is both the first consumed token and the first open delimiter
    slurp_dlist([Open], [Open], Following);
slurp_plist(Tokens) ->
    {slurp, [], Tokens}.


% slurp_dlist(All, Opens, Tokens): worker loop behind slurp_plist/1.
%
%   All    - every token consumed so far, most recently consumed first
%   Opens  - stack of delimiter tokens still waiting for their closer,
%            innermost (most recently opened) first
%   Tokens - the input that has not been looked at yet
%
% Returns `{slurp, Consumed, Rest}' as soon as the stack of open delimiters is
% empty; Consumed is in source order.  Returns
% `{error, {fixme, mismatch, Opens, ClosedBy}}' when a closing delimiter does
% not pair with the innermost open one (`ClosedBy = {value, Token}') or when
% the input ends while delimiters are still open (`ClosedBy = none').
%
% Clause order is significant: the clauses that pop a correctly paired closer
% must come before the clauses that treat any closer as a mismatch.

% done: every open delimiter has been closed
slurp_dlist(All, [], Rest) ->
    {slurp, lists:reverse(All), Rest};
% from here on the stack is known to be nonempty
% a closer that pairs with the innermost open delimiter pops it
slurp_dlist(All, [#tk{str = "("}         | Outer],
                 [#tk{str = ")"} = Close | Rest]) ->
    slurp_dlist([Close | All], Outer, Rest);
slurp_dlist(All, [#tk{str = "["}         | Outer],
                 [#tk{str = "]"} = Close | Rest]) ->
    slurp_dlist([Close | All], Outer, Rest);
slurp_dlist(All, [#tk{str = "{"}         | Outer],
                 [#tk{str = "}"} = Close | Rest]) ->
    slurp_dlist([Close | All], Outer, Rest);
% an opening delimiter is consumed and pushed onto the stack
slurp_dlist(All, Opens, [#tk{str = "("} = Open | Rest]) ->
    slurp_dlist([Open | All], [Open | Opens], Rest);
slurp_dlist(All, Opens, [#tk{str = "["} = Open | Rest]) ->
    slurp_dlist([Open | All], [Open | Opens], Rest);
slurp_dlist(All, Opens, [#tk{str = "{"} = Open | Rest]) ->
    slurp_dlist([Open | All], [Open | Opens], Rest);
% the input ran out while delimiters were still open
slurp_dlist(_All, Opens, []) ->
    {error, {fixme, mismatch, Opens, none}};
% a closer that reaches these clauses did not pair with the innermost open
% delimiter (the paired cases were handled above)
slurp_dlist(_All, Opens, [#tk{str = "}"} = BadClose | _]) ->
    {error, {fixme, mismatch, Opens, {value, BadClose}}};
slurp_dlist(_All, Opens, [#tk{str = "]"} = BadClose | _]) ->
    {error, {fixme, mismatch, Opens, {value, BadClose}}};
slurp_dlist(_All, Opens, [#tk{str = ")"} = BadClose | _]) ->
    {error, {fixme, mismatch, Opens, {value, BadClose}}};
% any other token is consumed without touching the stack
slurp_dlist(All, Opens, [Tk | Rest]) ->
    slurp_dlist([Tk | All], Opens, Rest).


%-------------------------------------------------------
% API: meta info
%
% This is parse order definition, list of keywords, etc
%
% -export([
%     token_shapes_parse_order/0,
%     kwds/0
% ]).
%-------------------------------------------------------

-spec token_shapes_parse_order() -> [tk_shape()].
% @doc
% list of sophia token shapes in parse order (if an earlier shape matches, the later
% shape isn't even checked)
%
%
%    Rules =
%          %% Comments and whitespace
%        [ CommentStart
%        , {"//.*", skip()}
%        , {WS,     skip()}
%
%          %% Special characters
%        , {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
%
%          %% Literals
%        , {CHAR,   token(char,   fun parse_char/1)}
%        , {STRING, token(string, fun parse_string/1)}
%        , {HEX,    token(hex,    fun parse_hex/1)}
%        , {INT,    token(int,    fun parse_int/1)}
%        , {BYTES,  token(bytes,  fun parse_bytes/1)}
%
%          %% Identifiers (qualified first!)
%        , {QID,   token(qid,  fun(S) -> string:tokens(S, ".") end)}
%        , {QCON,  token(qcon, fun(S) -> string:tokens(S, ".") end)}
%        , {TVAR,  token(tvar)}
%        , override({ID, token(id)}, {KW, symbol()})    %% Keywords override identifiers. Need to
%        , {CON, token(con)}                            %% use override to avoid lexing "lettuce"
%                                                       %% as ['let', {id, "tuce"}].
%          %% Operators
%        , {OP, symbol()}
%        ],
% @end

token_shapes_parse_order() ->
    % Every group has a name so that a shape can be added, dropped or moved by
    % editing one short list; the groups are flattened in parse order.
    Trivia      = [lcom, bcom, ws],
    Punctuation = [punct],
    Literals    = [char, string, int16, int10, bytes, ak, ct, sg],
    % qualified names have to be tried before the unqualified ones
    Qualified   = [qid, qcon],
    TypeVars    = [tvar],
    % kwd has to be tried before id
    Names       = [kwd, id, con],
    Operators   = [op],
    lists:flatten([Trivia, Punctuation, Literals, Qualified, TypeVars,
                   Names, Operators]).


-spec kwds() -> list(string()).
% @doc
% The sophia keywords, as strings.  The kwd clause of slurp_token_of_shape/3
% checks a matched id against this list to decide whether it is a keyword.

kwds() ->
    [
        "contract", "include", "let", "switch",
        "type", "record", "datatype",
        "if", "elif", "else",
        "function", "stateful", "payable",
        "true", "false", "mod",
        "public", "entrypoint", "private", "indexed",
        "namespace", "interface", "main",
        "using", "as", "for", "hiding",
        "band", "bor", "bxor", "bnot"
    ].


%-------------------------------------------------------
% API: token slurping
%
% -export([
%     tokens/1,
%     slurp_token/2,
%     slurp_token_shapes/3,
%     slurp_token_of_shape/3
% ]).
%-------------------------------------------------------

% Token accessors
-spec indent_level(tk()) -> pos_integer().
% @doc
% The indent level of a token is the second component of its `pos' field,
% i.e. the column the token starts in.

indent_level(#tk{pos = {_Line, Column}}) ->
    Column.


-spec significant_tokens(SrcStr) -> Result
    when SrcStr :: iolist(),
         Result :: {ok, Tokens}
                 | {error, gsc_err()},
         Tokens :: [tk()].
% @doc
% Tokenize `SrcStr' with tokens/1 and keep only the significant tokens (see
% is_significant/1).  Anything other than `{ok, Tokens}' coming back from
% tokens/1 is returned unchanged.

significant_tokens(SrcStr) ->
    case tokens(SrcStr) of
        {ok, AllTokens} ->
            Significant = filter_significant(AllTokens),
            {ok, Significant};
        NotOk ->
            NotOk
    end.


-spec filter_significant(Tokens) -> SignificantTokens
    when Tokens            :: [tk()],
         SignificantTokens :: Tokens.
% @doc
% Drop every token for which is_significant/1 is false; the order of the
% remaining tokens is unchanged.

filter_significant(Tokens) ->
    [Tk || Tk <- Tokens, is_significant(Tk)].


-spec is_significant(Token) -> boolean()
    when Token :: tk().
% @doc
% A token is insignificant when its shape is a block comment (`bcom'), a line
% comment (`lcom') or whitespace (`ws').  Everything else is significant.

is_significant(#tk{shape = Shape}) when Shape =:= bcom;
                                       Shape =:= lcom;
                                       Shape =:= ws ->
    false;
is_significant(_Other) ->
    true.


-spec tokens_from_iolist(SrcStr) -> Result when
        SrcStr  :: iolist(),
        Result  :: {ok, Tokens}
                 | {error, gsc_err()},
         Tokens :: [tk()].

% @doc
% Alias for tokens/1: the argument is passed straight through and the result
% is returned as-is.
tokens_from_iolist(SrcStr) ->
    tokens(SrcStr).


-spec tokens(SrcStr) -> Result
    when SrcStr :: iolist(),
         Result :: {ok, Tokens}
                 | {error, gsc_err()},
         Tokens :: [tk()].
% @doc
% Tokenize an entire source text.
%
% The input is normalized to a flat NFC list first, then tokens are slurped
% off the front one at a time, starting at position `{1, 1}', until the input
% is used up.  On success every token is returned in source order, including
% comments and whitespace.
%
% If no token shape matches at some point, the result is an error carrying
% the tokens found so far, the position where scanning stopped and the
% unscanned remainder of the input.  Semantically a nonempty remainder
% amounts to the presence of an illegal character.

tokens(SrcStr) ->
    % defensive normalization
    Normalized = unicode:characters_to_nfc_list(SrcStr),
    StartPos   = {1, 1},
    tokens([], StartPos, Normalized).

% tokens(Acc, Pos, Src)
%   Acc - tokens found so far, most recent first
%   Pos - position of the first character of Src
%   Src - input that has not been tokenized yet
tokens(Acc, _EndPos, "") ->
    {ok, lists:reverse(Acc)};
tokens(Acc, Pos, Src) ->
    case slurp_token(Pos, Src) of
        {tokmatch, Tk = #tk{str = Lexeme}, Remaining} ->
            % the next token starts where this token's text ends
            tokens([Tk | Acc], new_pos(Pos, Lexeme), Remaining);
        no_tokmatch ->
            {error, #gsc_err_no_tokmatch{prev_tokens = lists:reverse(Acc),
                                         break_pos   = Pos,
                                         rest        = Src}};
        % FIXME so_scan bad
        % NOTE(review): the earlier comment here said that so_scan allows
        % unterminated block comments at the end of a file and that this
        % module agrees with so_scan for now, yet this branch reports an
        % error -- confirm which behavior is intended
        {ierr, unterminated_block_comment} ->
            {error, #gsc_err_bcom_unterminated{prev_tokens = lists:reverse(Acc),
                                               break_pos   = Pos,
                                               rest        = Src}};
        {error, _} = Failure ->
            Failure
    end.

% alright some bullshit here
%
% we're computing the line/column position of each string
%
% however this is meant to be compatible with so_scan, so it's a bit wonky
% because regex list bullshit.
%
% recall that so_scan operates on the list representation of the utf-8 encoded
% bytes; this is different than on a list of bignum codepoints (e.g.
% unicode:characters_to_nfc_list(Bytes)); let's suppose some stupid complicated
% foreign character which a sane language would simply criminalize has list
% representation [ABC], but byte representation <<A,B,C>>
%
% as far as so_scan is concerned, this means the character ABC consumes 3
% columns. the only exception is tab characters, which always fast-forward to
% the next tab stop, which is 1-indexed because god hates all of us
%
% so the tab-stops are
%   1 9 17 25 33 ...
%
% column position is determined in all cases by byte order, EXCEPT for $\t
% which goes to the next tab stop
%
% so in general, for the token string, we need to convert to bytes first,
% then handle `\t` bytes as a special case
%
% again in the tokenizer context, we're assuming that the input to our
% tokenizer is an nfc-list which has a flat list of each unicode character in
% codepoint form
%
% here we're just converting it to byte form, then computing columns based on
% bytes
% new_pos(OldPos, TokStr)
%
% Position reached after consuming TokStr when starting at
% OldPos = {Line, Column}.  TokStr is converted to its UTF-8 bytes and the
% position is advanced byte by byte, so a character that encodes to several
% bytes moves the column by several steps.
new_pos(OldPos, TokStr) ->
    Utf8 = unicode:characters_to_binary(TokStr),
    new_pos_bytes(Utf8, OldPos).

% every byte has been accounted for
new_pos_bytes(<<>>, Pos) ->
    Pos;
% a newline moves to column 1 of the next line
new_pos_bytes(<<$\n, More/bytes>>, {Line, _Col}) ->
    new_pos_bytes(More, {Line + 1, 1});
% a tab jumps to the next tab stop
new_pos_bytes(<<$\t, More/bytes>>, {Line, Col1}) ->
    % columns are 1-based, so the tab stops are at 1 9 17 25 ...
    % the arithmetic is done 0-based, where the stops are at 0 8 16 24 ...
    TabCol1 = next_tabstop8(Col1 - 1) + 1,
    new_pos_bytes(More, {Line, TabCol1});
% any other byte advances the column by one
new_pos_bytes(<<_Byte, More/bytes>>, {Line, Col1}) ->
    new_pos_bytes(More, {Line, Col1 + 1}).

% First multiple of 8 that is strictly greater than the 0-based column Col0.
% 0     8       16      24      etc
% 0*8   1*8     2*8     3*8     etc
next_tabstop8(Col0) when Col0 >= 0 ->
    ((Col0 div 8) + 1) * 8.

%% copied from so_scan_lib.erl just to match behavior
%-define(TAB_SIZE, 8).
%
%next_pos([], P) -> P;
%next_pos([$\n | S], {L, _}) -> next_pos(S, {L + 1, 1});
%next_pos([$\t | S], {L, C}) -> next_pos(S, {L, (C + ?TAB_SIZE - 1) div ?TAB_SIZE * ?TAB_SIZE + 1});
%next_pos([_   | S], {L, C}) -> next_pos(S, {L, C + 1}).


-spec slurp_token(Pos, SrcStr) -> Result
    when Pos    :: tk_pos(),
         SrcStr :: string(),
         Result :: {tokmatch, Token, Rest}
                 | no_tokmatch
                 | {error, gsc_err()}
                 | {ierr, unterminated_block_comment},
         Token  :: tk(),
         Rest   :: string().
% @doc
% Slurp a single token off the front of `SrcStr', trying the token shapes in
% the order given by `token_shapes_parse_order/0'.  `Pos' is handed on to the
% shape matchers unchanged.

slurp_token(Pos, SrcStr) ->
    % the shape order is fetched on every call so that it stays easy to tweak
    ShapesInOrder = token_shapes_parse_order(),
    slurp_token_shapes(ShapesInOrder, Pos, SrcStr).


-spec slurp_token_shapes(ParseOrder, Pos, SrcStr) -> Result
    when ParseOrder :: [tk_shape()],
         Pos        :: tk_pos(),
         SrcStr     :: string(),
         Result     :: {tokmatch, Token, Rest}
                     | no_tokmatch
                     | {error, gsc_err()}
                     | {ierr, unterminated_block_comment},
         Token      :: tk(),
         Rest       :: string().
% @doc
% Slurp a single token off the front of `SrcStr', trying the shapes of
% `ParseOrder' from left to right.  The first shape that answers with
% anything other than `no_tokmatch' decides the result; when every shape
% answers `no_tokmatch' (or `ParseOrder' is empty) the result is
% `no_tokmatch'.

slurp_token_shapes([], _Pos, _SrcStr) ->
    no_tokmatch;
slurp_token_shapes([Shape | LaterShapes], Pos, SrcStr) ->
    case slurp_token_of_shape(Shape, Pos, SrcStr) of
        no_tokmatch ->
            % this shape does not fit here, try the next one
            slurp_token_shapes(LaterShapes, Pos, SrcStr);
        {tokmatch, _, _} = Hit ->
            Hit;
        {ierr, _} = InternalError ->
            InternalError;
        {error, _} = Failure ->
            Failure
    end.


-spec slurp_token_of_shape(TokenType, Pos, SrcStr) -> MaybeToken
    when TokenType  :: tk_shape(),
         Pos        :: tk_pos(),
         SrcStr     :: string(),
         MaybeToken :: {tokmatch, Token, Rest}
                     | no_tokmatch
                     | {error, gsc_err()}
                     | {ierr, unterminated_block_comment},
         Token      :: tk(),
         Rest       :: string().
% @doc
% Try to match one sophia token of the given shape at the very front of
% `SrcStr'.
%
% On success the token carries the requested shape, `Pos' as its position and
% the matched text as its string, and `Rest' is whatever follows the matched
% text.  `no_tokmatch' means the front of `SrcStr' does not have this shape.
% A block comment that is still open at the end of the input is reported as
% `{ierr, unterminated_block_comment}'.  A shape that has no clause below
% does not return at all: it raises, via error/1, a `#gsc_err{}' whose atom
% is `nyi'.
% @end

% COMMENTS AND WHITESPACE: lcom, bcom, ws
%
% Line comment: from "//" up to, but not including, the next newline (or up
% to the end of the input).
% FIXME: make a string matcher for line comments
slurp_token_of_shape(lcom, Pos, "//" ++ _ = SrcStr) ->
    {Comment, Rest} = takeline("", SrcStr),
    {tokmatch, #tk{shape = lcom, pos = Pos, str = Comment}, Rest};
slurp_token_of_shape(lcom, _Pos, _SrcStr) ->
    no_tokmatch;
% Block comment: these nest, so the depth has to be tracked (see bcom/3) and
% a plain string matcher will not do.
slurp_token_of_shape(bcom, Pos, "/*" ++ AfterOpen) ->
    case bcom("/*", 1, AfterOpen) of
        {ok, Comment, AfterComment} ->
            {tokmatch, #tk{shape = bcom, pos = Pos, str = Comment}, AfterComment};
        NotOk ->
            NotOk
    end;
slurp_token_of_shape(bcom, _Pos, _SrcStr) ->
    no_tokmatch;
slurp_token_of_shape(ws, Pos, SrcStr) ->
    slurp_strmatch(ws, gs_strmatch:smr_sf_ws(), Pos, SrcStr);
% KEYWORDS, OPERATORS, PUNCTUATION: kwd, op, punct
%
% A keyword may be the prefix of a user-defined name: "lettuce" has to come
% out as one id, not as "let" followed by "tuce".  Every keyword is also a
% valid id, so the front of the input is matched as an id first and the whole
% id is then looked up in kwds/0.
slurp_token_of_shape(kwd, Pos, SrcStr) ->
    case slurp_token_of_shape(id, Pos, SrcStr) of
        no_tokmatch ->
            no_tokmatch;
        {tokmatch, IdTk = #tk{str = Word}, Rest} ->
            case lists:member(Word, kwds()) of
                true  -> {tokmatch, IdTk#tk{shape = kwd}, Rest};
                false -> no_tokmatch
            end
    end;
slurp_token_of_shape(op, Pos, SrcStr) ->
    slurp_strmatch(op, gs_strmatch:smr_sf_op(), Pos, SrcStr);
slurp_token_of_shape(punct, Pos, SrcStr) ->
    slurp_strmatch(punct, gs_strmatch:smr_sf_punct(), Pos, SrcStr);
% SOPHIA VARIABLE NAMES: id, con, qid, qcon, tvar
slurp_token_of_shape(id, Pos, SrcStr) ->
    slurp_strmatch(id, gs_strmatch:smr_sf_id(), Pos, SrcStr);
slurp_token_of_shape(con, Pos, SrcStr) ->
    slurp_strmatch(con, gs_strmatch:smr_sf_con(), Pos, SrcStr);
slurp_token_of_shape(qid, Pos, SrcStr) ->
    slurp_strmatch(qid, gs_strmatch:smr_sf_qid(), Pos, SrcStr);
slurp_token_of_shape(qcon, Pos, SrcStr) ->
    slurp_strmatch(qcon, gs_strmatch:smr_sf_qcon(), Pos, SrcStr);
slurp_token_of_shape(tvar, Pos, SrcStr) ->
    slurp_strmatch(tvar, gs_strmatch:smr_sf_tvar(), Pos, SrcStr);
% LITERALS: int16, int10, ak, ct, sg, char, string, bytes
slurp_token_of_shape(int16, Pos, SrcStr) ->
    slurp_strmatch(int16, gs_strmatch:smr_sf_int16(), Pos, SrcStr);
slurp_token_of_shape(int10, Pos, SrcStr) ->
    slurp_strmatch(int10, gs_strmatch:smr_sf_int10(), Pos, SrcStr);
slurp_token_of_shape(ak, Pos, SrcStr) ->
    slurp_strmatch(ak, gs_strmatch:smr_sf_ak(), Pos, SrcStr);
slurp_token_of_shape(ct, Pos, SrcStr) ->
    slurp_strmatch(ct, gs_strmatch:smr_sf_ct(), Pos, SrcStr);
slurp_token_of_shape(sg, Pos, SrcStr) ->
    slurp_strmatch(sg, gs_strmatch:smr_sf_sg(), Pos, SrcStr);
slurp_token_of_shape(char, Pos, SrcStr) ->
    slurp_strmatch(char, gs_strmatch:smr_sf_char(), Pos, SrcStr);
% note that the matcher for the `string' shape is named smr_sf_str
slurp_token_of_shape(string, Pos, SrcStr) ->
    slurp_strmatch(string, gs_strmatch:smr_sf_str(), Pos, SrcStr);
slurp_token_of_shape(bytes, Pos, SrcStr) ->
    slurp_strmatch(bytes, gs_strmatch:smr_sf_bytes(), Pos, SrcStr);
% any shape without a clause above is not implemented
slurp_token_of_shape(NyiType, Pos, SrcStr) ->
    Message = io_lib:format("cannot slurp token of shape: ~p", [NyiType]),
    error(#gsc_err{atom   = nyi,
                   str    = Message,
                   extra  = [{token_shape, NyiType},
                             {pos, Pos},
                             {rest, SrcStr}]}).

% Run a gs_strmatch matcher against the front of SrcStr.  On a match, the
% matched text is wrapped in a token of the given shape located at Pos; on
% no_strmatch the answer is no_tokmatch.
slurp_strmatch(Shape, Matcher, Pos, SrcStr) ->
    case gs_strmatch:match(Matcher, SrcStr) of
        {strmatch, Matched, Rest} ->
            {tokmatch, #tk{shape = Shape, pos = Pos, str = Matched}, Rest};
        no_strmatch ->
            no_tokmatch
    end.


% takeline(Acc, Src)
%
% Split Src in front of its first newline.  Acc holds already-taken
% characters in reverse order; they end up, in forward order, in front of the
% characters taken from Src.  The newline itself stays at the head of the
% remainder.  Without a newline, all of Src is taken and the remainder is "".
takeline(Acc, Src) ->
    {Line, Rest} = lists:splitwith(fun(Char) -> Char =/= $\n end, Src),
    {lists:reverse(Acc, Line), Rest}.


% bcom(Comment, Depth, Src)
%
% Scan the remainder of a (possibly nested) block comment.
%   Comment - comment text collected so far, as nested chardata
%   Depth   - number of "/*" that have not been closed yet
%   Src     - input that has not been scanned yet
%
% Returns {ok, CommentStr, Rest} once Depth reaches 0, where CommentStr is
% the collected text as a flat NFC list, and
% {ierr, unterminated_block_comment} if the input ends first.

% every "/*" has found its "*/"
bcom(Comment, 0, Rest) ->
    {ok, unicode:characters_to_nfc_list(Comment), Rest};
% premature end of input
bcom(_Comment, Depth, "") when Depth > 0 ->
    {ierr, unterminated_block_comment};
% "*/" closes one level
bcom(Comment, Depth, "*/" ++ Rest) when Depth > 0 ->
    bcom([Comment, "*/"], Depth - 1, Rest);
% "/*" opens one more level
bcom(Comment, Depth, "/*" ++ Rest) when Depth > 0 ->
    bcom([Comment, "/*"], Depth + 1, Rest);
% any other character belongs to the comment at the current depth
bcom(Comment, Depth, [Char | Rest]) when Depth > 0 ->
    bcom([Comment, Char], Depth, Rest).


%------------------------------------------
% INTERNAL UTILITIES
%------------------------------------------

-spec take_while(Pred, List) -> {Taken, Rest}
    when Pred  :: fun((Item) -> boolean()),
         List  :: [Item],
         Taken :: List,
         Rest  :: List.
% @doc
% Split `List' in front of the first item for which `Pred' returns false.
% `Taken' is the longest prefix whose items all satisfy `Pred'; `Rest' is
% everything after it.  Similar to lists:takewhile/2, but the remainder is
% returned as well -- the name is a reminder that you get 2 things back.

take_while(Pred, List) ->
    take_while(Pred, [], List).


-spec take_while(Pred, Prefix, List) -> {Taken, Rest}
    when Pred  :: fun((Item) -> boolean()),
         Prefix :: List,
         List   :: [Item],
         Taken  :: List,
         Rest   :: List.
% @doc
% Like take_while/2, but with a list of already-accepted items: the result is
% `{Prefix ++ TakenFromList, Rest}'.  `Pred' is applied to the items of
% `List' only, never to the items of `Prefix'.

take_while(Pred, Prefix, List) ->
    % lists:splitwith/2 is the stdlib's takewhile-plus-remainder
    {Taken, Rest} = lists:splitwith(Pred, List),
    {Prefix ++ Taken, Rest}.