gsc/src/gso_scan.erl

% @doc compatibility layer to test against so_scan
%
% converts gsc_tokens data to so_scan tokens
%
% Ref: so_scan.erl
-module(gso_scan).

-export_type([
    so_kwd/0,
    so_special_char/0,
    so_symbol/0,
    so_token2/0,
    so_token3/0,
    so_token/0
]).

-export([
    scan/1,
    ken_barson_rises/2
]).

-include("$gsc_include/gsc.hrl").

%================================
% API: types
%================================

% Keyword atoms that appear as the symbol of a two-element so_scan token.
%
% Every atom is single-quoted, including the ones that do not need quoting
% today, so that this declaration keeps compiling if a future Erlang release
% turns one of these words into a reserved word.
-type so_kwd() :: 'contract'
                | 'include'
                | 'let'
                | 'switch'
                | 'type'
                | 'record'
                | 'datatype'
                | 'if'
                | 'elif'
                | 'else'
                | 'function'
                | 'stateful'
                | 'payable'
                | 'true'
                | 'false'
                | 'mod'
                | 'public'
                | 'entrypoint'
                | 'private'
                | 'indexed'
                | 'namespace'
                | 'interface'
                | 'main'
                | 'using'
                | 'as'
                | 'for'
                | 'hiding'
                | 'band'
                | 'bor'
                | 'bxor'
                | 'bnot'.

% Punctuation atoms. This is one of the alternatives of so_symbol(), i.e. a
% value of this type appears as the first element of a so_token2().
-type so_special_char() :: '..'
                         | ','
                         | '.'
                         | ';'
                         | '('
                         | ')'
                         | '['
                         | ']'
                         | '{'
                         | '}'
                         .

% Deliberately loose type: a symbol is whatever atom results from casting the
% matched token text to an atom (see the list_to_atom/1 call in
% to_so_token/1), so the union below collapses to atom(). so_kwd() and
% so_special_char() are spelled out for the reader's benefit only; operator
% atoms are not enumerated anywhere.
-type so_symbol() :: so_kwd() | so_special_char() | atom().

% Two-element token: a bare symbol plus its source position. to_so_token/1
% produces this form for the kwd, op and punct shapes.
-type so_token2() :: {Symbol   :: so_symbol(),
                      Location :: tk_pos()}.

% Tags of the three-element tokens.
% FIXME (original note, kept verbatim because its intent is unclear):
% this is 'id', 'con', qid
-type so_tk3type() :: char | string | hex | int | bytes | qid | qcon | tvar | id | con.

% Three-element token: tag, source position and a tag-dependent value.
% As built by to_so_token/1 the value is
%   - a list of name segments (split on ".") for qid / qcon,
%   - the token text itself for id / con / tvar,
%   - the parsed literal for char / string / hex / int / bytes.
-type so_token3() :: {TokenType  :: so_tk3type(),
                      Location   :: tk_pos(),
                      TokenValue :: term()}.

% Any token in so_scan's output format.
-type so_token() :: so_token2() | so_token3().


%================================
% API: functions
%================================

-spec scan(SrcStr) -> {ok, SoTokens} | {error, gsc_err()}
    when SrcStr   :: iolist(),
         SoTokens :: [so_token()].
% @doc
% Tokenizes source text and reports the tokens in so_scan's output format.
%
% The goal is for the result to agree with so_scan:scan/1 on every input.
% The source is lexed by gsc_tokens:tokens/1 and gsc's internal token
% representation is then translated by to_so_tokens/1.
% @end

scan(SrcStr) ->
    so_scan_result(gsc_tokens:tokens(SrcStr)).

% @private
% Maps the outcome of gsc_tokens:tokens/1 onto the return shape of scan/1.
so_scan_result({ok, GscTokens}) ->
    {ok, to_so_tokens(GscTokens)};
% An unterminated block comment is deliberately reported as a success, built
% from the tokens carried in the error's prev_tokens field.
% NOTE(review): presumably this mirrors so_scan not rejecting such input --
% confirm against so_scan.erl.
so_scan_result({error, #gsc_err_bcom_unterminated{prev_tokens = GscTokens}}) ->
    {ok, to_so_tokens(GscTokens)};
% Anything else is handed back to the caller untouched.
so_scan_result(Other) ->
    Other.


-spec to_so_tokens(GscTokens) -> SoTokens
    when GscTokens    :: [tk()],
         SoTokens     :: [so_token()].

% @doc
% Translates a list of gsc tokens into so_scan tokens.
%
% Most gsc tokens map one-to-one (or one-to-nothing, for comments and
% whitespace) and are converted by to_so_token/1. The exception is the
% ak/ct/sg literals, which are a many-to-one mapping and therefore have to be
% handled here, at the list level:
%
%   - so_scan lexes ak_ABCD as an id and only computes the corresponding
%     pubkey in its parsing stage;
%   - gsc stops an ak/ct/sg token at the first non-base58 character, so
%     ak_GHI (I is not a base58 character) lexes as
%
%       [{ak, "ak_GH"}, {con, "I"}]
%
%     whereas so_scan produces the single token {id, "ak_GHI"}.
%
% gsc does not discard whitespace, so the split is detectable: it happened
% exactly when an ak/ct/sg token is immediately followed by a non-whitespace
% token. There may be several such follow-on tokens, of various shapes, so
% they are greedily merged back into one id by ken_barson_rises/2. The
% emitted id token keeps the position of the original ak/ct/sg token.
% @end
to_so_tokens(GscTokens) ->
    to_so_tokens(GscTokens, []).

% @private
% Worker for to_so_tokens/1; Acc holds the so_scan tokens produced so far,
% newest first.
to_so_tokens([], Acc) ->
    lists:reverse(Acc);
to_so_tokens([Tok = #tk{shape = Shape, pos = Pos} | Rest], Acc)
        when Shape =:= ak; Shape =:= ct; Shape =:= sg ->
    % merge any split-off tail tokens back into the literal
    {#tk{str = Joined}, Remaining} = ken_barson_rises(Tok, Rest),
    to_so_tokens(Remaining, [{id, Pos, Joined} | Acc]);
to_so_tokens([Tok | Rest], Acc) ->
    % plain filtermap step: drop the token or keep its translation
    case to_so_token(Tok) of
        {true, SoToken} -> to_so_tokens(Rest, [SoToken | Acc]);
        false           -> to_so_tokens(Rest, Acc)
    end.


-spec ken_barson_rises(InitApiToken, SfToks) -> {FinalApiToken, NewSfToks}
    when InitApiToken  :: tk(),
         SfToks        :: [tk()],
         FinalApiToken :: InitApiToken,
         NewSfToks     :: SfToks.
% @doc
% Re-joins an ak/ct/sg token with the tokens gsc split off its tail.
%
% Example: gsc lexes `ak_GHI` as `ak_GH` followed by `I`; this function
% glues the pieces back together so they form the text of the single id token
% that so_scan produces (and which will then fail in the parsing step).
%
% Algorithm: look at the next token.
%   - If its shape is listed in smash_types/0, so_scan would have consumed it
%     as part of the id: append its text to the accumulated token and
%     continue with the remaining tokens.
%   - If its shape is listed in pass_types/0, stop and return the accumulated
%     token together with the untouched remaining tokens (the inspected token
%     included).
%   - A shape in neither list fails the sanity check with a badmatch.
%   - An empty token list ends the merge.
% Only the `str` field of the accumulated token is updated.
%
% Gluing texts together relies on the property that the concatenation of all
% token strings equals the original source (FIXME: should test this in the
% test suite).
%
% Background, from the original notes (not verified here):
%   so_scan lexes identifiers with /[a-z_][a-zA-Z0-9_']*/. The base58
%   alphabet used by `smr_apistr58` is
%
%       123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz
%
%   Characters valid in a so_scan id tail but absent from base58:
%
%   | char | why excluded from base58              |
%   |------|---------------------------------------|
%   | `0`  | looks like `O`                        |
%   | `I`  | looks like `1` or `l`                 |
%   | `O`  | looks like `0`                        |
%   | `l`  | looks like `1` or `I`                 |
%   | `_`  | not alphanumeric (structural, not b58)|
%   | `'`  | not alphanumeric (sophia id quirk)    |
%
%   When one of these appears AFTER at least one valid base58 character in an
%   `ak_`/`ct_`/`sg_` prefixed identifier, gsc splits what so_scan sees as
%   one id token into two or more gsc tokens.
%
%   There is no split when the non-base58 character directly follows the
%   `_`: `smr_plus` requires at least one base58 character to match, so
%   `ak_I`, `ak_0` and `ak__bar` all fall through to id and both
%   tokenizers agree.
% @end
ken_barson_rises(Joined, []) ->
    {Joined, []};
ken_barson_rises(Joined = #tk{str = Prefix},
                 Pending = [#tk{shape = Shape, str = Tail} | Others]) ->
    Absorb = lists:member(Shape, smash_types()),
    % sanity check: every shape must be classified as either smash or pass
    true = Absorb orelse lists:member(Shape, pass_types()),
    case Absorb of
        true ->
            % so_scan would have swallowed this token into the id
            ken_barson_rises(Joined#tk{str = Prefix ++ Tail}, Others);
        false ->
            % this token ends the id; hand it back unconsumed
            {Joined, Pending}
    end.

% @private
% Token shapes that ken_barson_rises/2 merges into a preceding ak/ct/sg
% token. One example per shape (source text -> split-off gsc token):
%
%   char    ak_GH'a'   -> {char,  "'a'"}
%   int16   ak_GH0xAB  -> {int16, "0xAB"}
%   int10   ak_GH0123  -> {int10, "0123"}
%   tvar    ak_GH'a    -> {tvar,  "'a"}
%   kwd     ak_GHlet   -> {kwd,   "let"}
%   id      ak_GH_AB   -> {id,    "_AB"}
%   con     ak_GHI     -> {con,   "I"}
smash_types() ->
    [char, int16, int10, tvar, kwd, id, con].

% @private
% Token shapes that end the merge in ken_barson_rises/2. For each of them the
% leading character either makes so_scan break out of the id it is consuming,
% or the shape can never be a split-off neighbour:
%
%   lcom      ak_AB//           breaks out of id
%   bcom      ak_AB/*           breaks out of id
%   ws        ak_AB\t           breaks out of id
%   punct     ak_AB{            breaks out of id
%   string    ak_AB"            breaks out of id
%   bytes     ak_AB#            breaks out of id
%   ak,ct,sg  ak_ABak           [akctsg] are all in the base58 alphabet
%   qid       ak_ABI.Am.A.qid   ??? maybe sophia lexes this to [{id, _}, '.']?
%   qcon      ak_ABI.Am.A.QCon  ??? same
%   op        ak_AB=<           [=!<>+-*/:&|?~@^] break out of id
pass_types() ->
    [lcom, bcom, ws, punct, string, bytes,
     ak, ct, sg,
     qid, qcon,
     op].


-spec to_so_token(GscToken) -> MaybeSoToken
    when GscToken     :: tk(),
         MaybeSoToken :: {true, SoToken}
                       | false,
         SoToken      :: so_token().

% @private
% Translates a single gsc token into its so_scan counterpart.
%
% Returns `false` for tokens so_scan does not report (comments and
% whitespace) and `{true, SoToken}` otherwise, i.e. the contract expected by
% lists:filtermap/2.
%
% Does NOT handle ak/ct/sg because these may consume follow-on tokens; see
% to_so_tokens/1. Any shape without a clause below raises
% error(#gsc_err{atom = nyi}).
% @end

to_so_token(#tk{shape = Shape,
                pos   = Pos,
                str   = Str}) ->
    case Shape of
        %-----------------
        % Ignored
        %-----------------
        bcom -> false;
        lcom -> false;
        ws   -> false;
        %-----------------------
        % {_, _}
        %
        % {contract, {420, 69}}
        %-----------------------
        % kwds, ops and punct are three different parsers on the gsc side,
        % but so_scan:scan collapses all of them to e.g.
        % {'contract', {420, 69}} where {420, 69} is the source location
        Sym when Sym =:= kwd;
                 Sym =:= op;
                 Sym =:= punct ->
            % NOTE(review): list_to_atom/1 on source text creates atoms that
            % are never garbage collected. Kept as is to match the so_scan
            % output; do not feed this layer untrusted input in a long-lived
            % node.
            Symbol = list_to_atom(Str),
            {true, {Symbol, Pos}};
        %------------------------------------
        % {_, _, _}
        %
        % {id, {420, 69}, "foo"}
        %--------------------------------
        QVar when QVar =:= qid; QVar =:= qcon ->
            % qualifieds tokenize to
            % {qid, {420, 69}, ["Foo", "Bar", "baz"]}
            {true, {QVar, Pos, string:tokens(Str, ".")}};
        Var when Var =:= id; Var =:= con; Var =:= tvar ->
            {true, {Var, Pos, Str}};
        % literals
        % from so_scan:
        % {CHAR,   token(char,   fun parse_char/1)}
        % {STRING, token(string, fun parse_string/1)}
        % {HEX,    token(hex,    fun parse_hex/1)}
        % {INT,    token(int,    fun parse_int/1)}
        % {BYTES,  token(bytes,  fun parse_bytes/1)}
        % so_scan casts strings to binary
        char    -> {true, {char,   Pos, so_parse_char(Str)}};
        string  -> {true, {string, Pos, so_parse_string(Str)}};
        int16   -> {true, {hex,    Pos, so_parse_hex(Str)}};
        int10   -> {true, {int,    Pos, so_parse_int(Str)}};
        bytes   -> {true, {bytes,  Pos, so_parse_bytes(Str)}};
        Unhandled ->
            % The message names this module (it used to say gsc_so_scan,
            % which does not exist) and is flattened so that `str` is a
            % plain string like in the other #gsc_err{} values built in this
            % file; a flat string is still a valid iolist.
            Msg = lists:flatten(
                    io_lib:format("gso_scan:to_so_token/1: unhandled token shape: ~p",
                                  [Unhandled])),
            error(#gsc_err{atom = nyi,
                           str  = Msg})
    end.

%% ak/ct/sg all tokenize to id.
%%
%% They are deliberately not handled in to_so_token/1: so_scan lexes ak/ct/sg
%% as ids and only interprets them as addresses/sigs in its parsing step, and
%% on the gsc side such a literal may have to absorb follow-on tokens first.
%% The conversion to {id, Pos, Str} therefore happens at the list level, in
%% to_so_tokens/1.

% copied from so_scan.erl
%
% Parses the text of a character literal (opening and closing single quote
% included) into one code point. The text after the opening quote is
% unescaped with `'` as the delimiter and NFC-normalised; if that does not
% yield exactly one character, error(#gsc_err{atom = bad_token}) is raised.
so_parse_char([$' | Body]) ->
    Unescaped = unescape($', Body, []),
    case unicode:characters_to_nfc_list(Unescaped) of
        [Codepoint] ->
            Codepoint;
        _Other ->
            Text = "Bad character literal: '" ++ Body,
            error(#gsc_err{atom = bad_token,
                           str  = Text})
    end.

% Parses the text of a string literal (surrounding double quotes included)
% into an NFC-normalised binary, resolving escape sequences on the way.
so_parse_string([$" | Body]) ->
    Unescaped = unescape(Body),
    unicode:characters_to_nfc_binary(Unescaped).

% FIXME: simplify the unescape functions.
%
% The shape of this code is inherited from so_scan, which operates on lists
% of bytes rather than on lists of characters, so single code points have to
% be turned into multi-byte sequences there. Here we always work on character
% lists, so this can probably be simplified. It has not been worth the effort
% so far, but note that this function has been the source of several
% annoying bugs.
%
% Unescapes the remainder of a string literal, i.e. everything after the
% opening double quote up to and including the closing one.
unescape(StringBody) ->
    DoubleQuote = $",
    unescape(DoubleQuote, StringBody, []).

% Unescapes the body of a literal that is closed by Delim.
%
%   Delim - the closing quote character ($" for strings, $' for chars)
%   Chars - the text after the opening quote, closing quote included
%   Acc   - output collected so far, newest first
%
% Returns the result of unicode:characters_to_binary/1 on the collected
% output once Chars is exactly the closing delimiter. Recognised escapes are
% \x{H...}, \xHH, the escaped delimiter, \\ and \b \e \f \n \r \t \v; any
% other escape raises error(#gsc_err{atom = bad_escape_char}).
unescape(Delim, [Delim], Acc) ->
    unicode:characters_to_binary(lists:reverse(Acc));
unescape(Delim, [$\\, $x, ${ | AfterBrace], Acc) ->
    % \x{H...}: hex digits run up to the closing brace, which is dropped
    {HexDigits, [_CloseBrace | Rest]} =
        lists:splitwith(fun(Ch) -> Ch =/= $} end, AfterBrace),
    Decoded = unescape_hex(HexDigits),
    unescape(Delim, Rest, [Decoded | Acc]);
unescape(Delim, [$\\, $x, Hi, Lo | Rest], Acc) ->
    % \xHH: exactly two hex digits
    Decoded = unescape_hex([Hi, Lo]),
    unescape(Delim, Rest, [Decoded | Acc]);
unescape(Delim, [$\\, Code | Rest], Acc) ->
    Char = unescape_code(Delim, Code),
    unescape(Delim, Rest, [Char | Acc]);
unescape(Delim, [Plain | Rest], Acc) ->
    unescape(Delim, Rest, [Plain | Acc]).

% Turns a run of hex digits into the NFC-normalised character list for that
% code point.
unescape_hex(HexDigits) ->
    Codepoint = list_to_integer(HexDigits, 16),
    unicode:characters_to_nfc_list([Codepoint]).

% Maps the character following a backslash to the character it stands for.
% The delimiter clause comes first so an escaped closing quote wins.
unescape_code(Delim, Delim) -> Delim;
unescape_code(_Delim, $\\)  -> $\\;
unescape_code(_Delim, $b)   -> $\b;
unescape_code(_Delim, $e)   -> $\e;
unescape_code(_Delim, $f)   -> $\f;
unescape_code(_Delim, $n)   -> $\n;
unescape_code(_Delim, $r)   -> $\r;
unescape_code(_Delim, $t)   -> $\t;
unescape_code(_Delim, $v)   -> $\v;
unescape_code(_Delim, Code) ->
    error(#gsc_err{atom = bad_escape_char,
                   str  = "Bad control sequence: \\" ++ [Code]}).  %% TODO


% Parses a hexadecimal integer literal: drops the leading "0x" and any
% underscore separators, then reads the rest as base 16.
so_parse_hex([$0, $x | Digits]) ->
    Cleaned = strip_underscores(Digits),
    list_to_integer(Cleaned, 16).

% Parses a decimal integer literal, ignoring underscore separators.
so_parse_int(S) ->
    Cleaned = strip_underscores(S),
    list_to_integer(Cleaned).

% Parses a bytes literal ("#" followed by hex digits, underscores allowed)
% into a binary. The binary holds one byte per two hex digits, rounded up, so
% an odd number of digits is padded with leading zero bits.
so_parse_bytes([$# | Raw]) ->
    HexDigits = strip_underscores(Raw),
    Value     = list_to_integer(HexDigits, 16),
    ByteCount = (length(HexDigits) + 1) div 2,
    <<Value:(ByteCount * 8)>>.

% Removes every underscore from a string, keeping all other characters in
% their original order.
strip_underscores(S) ->
    [C || C <- S, C /= $_].