wip name cleanups

2026-06-01 18:00:37 -07:00
parent f548c7d88d
commit 9da6dbf18d
12 changed files with 804 additions and 281 deletions
@@ -0,0 +1,457 @@
+% @doc compatibility layer to test against so_scan
+%
+% converts gsc_tokens data to so_scan tokens
+%
+% Ref: so_scan.erl
+-module(gso_scan).
+
+-export_type([
+    so_kwd/0,
+    so_special_char/0,
+    so_symbol/0,
+    so_token2/0,
+    so_token3/0,
+    so_token/0
+]).
+
+-export([
+    scan/1,
+    ken_barson_rises/2
+]).
+
+-include("$gsc_include/gsc.hrl").
+
+%================================
+% API: types
+%================================
+
+% Every atom in the type declarations below is single-quoted on purpose. A
+% quoted atom stays a valid atom even if a later Erlang/OTP release reserves
+% the bare word (as happened with `maybe`), so new keywords cannot break
+% these declarations.
+
+% Keywords as they appear in so_scan-style tokens: the keyword text cast to
+% an atom.
+-type so_kwd() :: 'contract'
+                | 'include'
+                | 'let'
+                | 'switch'
+                | 'type'
+                | 'record'
+                | 'datatype'
+                | 'if'
+                | 'elif'
+                | 'else'
+                | 'function'
+                | 'stateful'
+                | 'payable'
+                | 'true'
+                | 'false'
+                | 'mod'
+                | 'public'
+                | 'entrypoint'
+                | 'private'
+                | 'indexed'
+                | 'namespace'
+                | 'interface'
+                | 'main'
+                | 'using'
+                | 'as'
+                | 'for'
+                | 'hiding'
+                | 'band'
+                | 'bor'
+                | 'bxor'
+                | 'bnot'.
+
+% Punctuation symbols, again as the matched text cast to an atom.
+-type so_special_char() :: '..'
+                         | ','
+                         | '.'
+                         | ';'
+                         | '('
+                         | ')'
+                         | '['
+                         | ']'
+                         | '{'
+                         | '}'.
+
+% @doc deliberately loose type. a symbol is essentially a string that is the
+% outcome of a regex match, cast to an atom, so the union collapses to
+% atom(). so_kwd() and so_special_char() are spelled out only for their
+% documentation value.
+-type so_symbol() :: so_kwd() | so_special_char() | atom().
+
+% two-element token: a symbol plus where it was found
+-type so_token2() :: {Symbol   :: so_symbol(),
+                      Location :: gsc_pos()}.
+
+% tags used by the three-element tokens
+% FIXME (carried over from the original note, intent unclear):
+% "this is 'id', 'con', qid"
+-type so_tk3type() :: 'char'
+                    | 'string'
+                    | 'hex'
+                    | 'int'
+                    | 'bytes'
+                    | 'qid'
+                    | 'qcon'
+                    | 'tvar'
+                    | 'id'
+                    | 'con'.
+
+% three-element token: a tag, a location and a payload whose shape depends
+% on the tag
+-type so_token3() :: {TokenType  :: so_tk3type(),
+                      Location   :: gsc_pos(),
+                      TokenValue :: term()}.
+
+% any token that scan/1 can emit
+-type so_token() :: so_token2() | so_token3().
+
+
+%================================
+% API: functions
+%================================
+
+-spec scan(SrcStr) -> {ok, SoTokens} | {error, gsc_err()}
+    when SrcStr   :: iolist(),
+         SoTokens :: [so_token()].
+% @doc
+% lexes SrcStr with gsc_tokens:tokens/1 and re-expresses gsc's internal
+% token representation in the format that so_scan outputs.
+%
+% this is meant to agree with so_scan:scan/1 in all cases
+%
+% an unterminated block comment is NOT reported as an error: the tokens that
+% were lexed before it are converted and returned as a success (presumably
+% to mirror what so_scan does -- confirm against so_scan.erl). any other
+% tokenizer result is handed back unchanged.
+% @end
+
+scan(SrcStr) ->
+    so_scan_result(gsc_tokens:tokens(SrcStr)).
+
+% @private
+% maps a gsc_tokens:tokens/1 result onto the return shape of scan/1
+so_scan_result({ok, Tokens}) ->
+    {ok, to_so_tokens(Tokens)};
+so_scan_result({error, #gsc_err_bcom_unterminated{prev_tokens = Tokens}}) ->
+    % keep whatever was lexed before the unterminated comment
+    {ok, to_so_tokens(Tokens)};
+so_scan_result(Other) ->
+    Other.
+
+
+
+-spec to_so_tokens(SfcTokens) -> SoTokens
+    when SfcTokens    :: [tk()],
+         SoTokens     :: [so_token()].
+
+% @doc
+% converts a list of gsc tokens into the equivalent list of so_scan tokens.
+%
+% most gsc tokens map 1-to-1 onto so_scan tokens (or are dropped, see
+% to_so_token/1). the exception is ak/ct/sg literals: there the mapping is
+% many-to-one, so those have to be handled here, at the list level.
+%
+% the reason is as follows:
+%
+% so_scan lexes ak_ABCD to an id, and only at the parsing stage computes the
+% pubkey that corresponds to.
+%
+% as a result, if we have ak_GHI, I is not a valid base58 char, so gsc ends
+% up lexing that as
+%
+%   [{ak, "ak_GH"}, {con, "I"}]
+%
+% whereas so_scan lexes it as [{id, "ak_GHI"}].
+%
+% gsc does not throw whitespace away, so the split is detectable: it occurs
+% precisely when an ak/sg/ct token is immediately followed by a
+% non-whitespace token. there can be more than one such follower, and they
+% come in a variety of shapes, so they are greedily consumed back into a
+% single id. that work is delegated to ken_barson_rises/2.
+to_so_tokens(SfcTokens) ->
+    to_so_tokens(SfcTokens, []).
+
+% @private
+% accumulator loop behind to_so_tokens/1; Acc holds the so_scan tokens
+% produced so far, newest first
+to_so_tokens([], Acc) ->
+    lists:reverse(Acc);
+to_so_tokens([Tok = #tk{type = Type, pos = Pos} | Rest], Acc)
+        when Type =:= ak orelse Type =:= ct orelse Type =:= sg ->
+    % the resulting id keeps the position of the ak/ct/sg token itself
+    {#tk{string = IdStr}, Remaining} = ken_barson_rises(Tok, Rest),
+    to_so_tokens(Remaining, [{id, Pos, IdStr} | Acc]);
+to_so_tokens([Tok | Rest], Acc) ->
+    % everything else is a plain filtermap over to_so_token/1
+    case to_so_token(Tok) of
+        {true, SoToken} -> to_so_tokens(Rest, [SoToken | Acc]);
+        false           -> to_so_tokens(Rest, Acc)
+    end.
+
+
+
+-spec ken_barson_rises(InitApiToken, SfToks) -> {FinalApiToken, NewSfToks}
+    when InitApiToken  :: tk(),
+         SfToks        :: [tk()],
+         FinalApiToken :: InitApiToken,
+         NewSfToks     :: SfToks.
+% @doc
+%
+%                  .-""""""""""""-.
+%               .-'   .-======-.   '-.
+%             .'     /  .----.  \     '.
+%            /      |  /      \  |      \
+%           |       | |  @  @  | |       |
+%           |       | |   __   | |       |
+%           |       | |  /@@\  | |       |
+%            \      | |  \__/  | |      /
+%             '.    |  \_++++_/  |    .'
+%               '-._|   |\/\/|   |_.-'
+%                   |   |/\/\|   |
+%                   |   \____/   |
+%                ___|  BEN CARSON |___
+%             .-'   |  HAS BECOME |   '-.
+%            /      |  TOO POWERFUL|      \
+%           /       |______________|       \
+%          /      .-'''-.      .-'''-.      \
+%         |      /  .-.  \    /  .-.  \      |
+%         |     |  (   )  |  |  (   )  |     |
+%         |      \  '-'  /    \  '-'  /      |
+%          \      '-...-'  /\  '-...-'      /
+%           '._           /  \           _.'
+%              '-._____.-'    '-._____.-'
+%
+%              THE SOFT-SPOKEN DOOM DOCTOR
+%               “I prescribed… CHAOS.”
+%
+%         BUGS IN THE HAPPY PATH ARE features.
+%         BUGS IN THE HAPPY PATH ARE features.
+%
+%                  WE LIKE features.
+%
+%               features MAKE US MONEY.
+%
+%                features ARE NOT FOOD.
+%                features ARE friends.
+% @end
+
+
+
+% This function takes the unconjoined twins (e.g.
+% `ak_GHI` lexed to `ak_GH` followed by `I`) and
+% recursively reconjoins them so they can all live
+% happily together as a single so_scan token which will
+% fail in the parsing step.
+%
+% plain string concatenation is enough to rebuild the
+% original text, on account of the property that the
+% concatenation of all the token strings equals the
+% original source file
+% (FIXME: should test this in test suite)
+%
+% basically this looks at the next token, and if it's a
+% type that so_scan is going to consume as part of an
+% `id` token, then we add it to the stack.
+%
+% quoth claude:
+%   so_scan lexes identifiers with
+%   /[a-z_][a-zA-Z0-9_']*/. The base58 alphabet used by
+%   `smr_apistr58` is:
+%
+%       123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz
+%
+%   Characters **valid in a so_scan id tail** but
+%   **absent from base58**:
+%
+%   | char | why excluded from base58              |
+%   |------|---------------------------------------|
+%   | `0`  | looks like `O`                        |
+%   | `I`  | looks like `1` or `l`                 |
+%   | `O`  | looks like `0`                        |
+%   | `l`  | looks like `1` or `I`                 |
+%   | `_`  | not alphanumeric (structural, not b58)|
+%   | `'`  | not alphanumeric (sophia id quirk)    |
+%
+%   When any of these appear AFTER at least one valid
+%   base58 char in a `ak_`/`ct_`/`sg_` prefixed
+%   identifier, `gsc` splits what `so_scan` sees as one
+%   `id` token into 2+ gsc tokens.
+%
+%   **No split if non-base58 char is immediately after
+%   `_`**: `smr_plus` requires >=1 base58 char to
+%   match; `ak_I`, `ak_0`, `ak__bar` all fall
+%   through to `id` and both tokenizers agree.
+% contract in short: glue every directly-following "smash" token onto the
+% accumulated ak/ct/sg token, stop at the first "pass" token (or when the
+% input runs out), and return the grown token together with the untouched
+% remainder. a token type listed in neither smash_types/0 nor pass_types/0
+% fails the sanity match below with a badmatch.
+ken_barson_rises(Done, []) ->
+    % nothing left to look at
+    {Done, []};
+ken_barson_rises(Acc = #tk{string = SoFar},
+                 Pending = [#tk{type = NextType, string = NextStr} | Tail]) ->
+    IsSmash = lists:member(NextType, smash_types()),
+    IsPass  = lists:member(NextType, pass_types()),
+    % sanity check: every token type must be classified one way or the other
+    true = IsSmash or IsPass,
+    case IsSmash of
+        true ->
+            % so_scan would have kept reading this as part of the id, so
+            % append its text and keep going
+            ken_barson_rises(Acc#tk{string = SoFar ++ NextStr}, Tail);
+        false ->
+            % the id ends here; hand the remaining tokens back untouched
+            {Acc, Pending}
+    end.
+
+% @private
+% token types that ken_barson_rises/2 appends to a preceding ak/ct/sg token.
+% each entry is annotated with a source snippet and the gsc token that
+% follows the `ak_GH` part of it.
+smash_types() ->
+    [
+        % ak_GH'a'  -> {char, "'a'"}
+        char,
+        % ak_GH0xAB -> {int16, "0xAB"}
+        int16,
+        % ak_GH0123 -> {int10, "0123"}
+        int10,
+        % ak_GH'a   -> {tvar, "'a"}
+        tvar,
+        % ak_GHlet  -> {kwd, "let"}
+        kwd,
+        % ak_GH_AB  -> {id, "_AB"}
+        id,
+        % ak_GHI    -> {con, "I"}
+        con
+    ].
+
+% @private
+% token types at which ken_barson_rises/2 stops consuming.
+%
+% each entry says why that type can never be a disjoined neighbor: either
+% its first character makes so_scan break out of consuming an id, or the
+% split simply cannot happen.
+pass_types() ->
+    [
+        % ak_AB//           breaks out of id
+        lcom,
+        % ak_AB/*           breaks out of id
+        bcom,
+        % ak_AB\t           breaks out of id
+        ws,
+        % ak_AB{            breaks out of id
+        punct,
+        % ak_AB"            breaks out of id
+        string,
+        % ak_AB#            breaks out of id
+        bytes,
+        % ak_ABak           [akctsg] all in base58 alphabet
+        ak, ct, sg,
+        % ak_ABI.Am.A.qid   ??? maybe sophia lexes this to [{id, _}, '.']?
+        qid,
+        % ak_ABI.Am.A.QCon  ??? same
+        qcon,
+        % ak_AB=<           [=!<>+-*/:&|?~@^] break out of id
+        op
+    ].
+
+
+
+-spec to_so_token(SfcToken) -> MaybeSoToken
+    when SfcToken     :: tk(),
+         MaybeSoToken :: {true, SoToken}
+                       | false,
+         SoToken      :: so_token().
+
+% @private
+% converts one gsc token into its so_scan counterpart, using the
+% lists:filtermap/2 convention: {true, SoToken} keeps the token, false
+% drops it.
+%
+% does NOT handle ak/ct/sg because these may consume
+% follow-on tokens
+%
+% raises error(#gsc_err{atom = nyi}) for any token type not listed below
+% @end
+
+to_so_token(#tk{type   = SfTokenType,
+                pos    = Pos,
+                string = SfTokenStr}) ->
+    case SfTokenType of
+        %-----------------
+        % Ignored
+        %-----------------
+        bcom -> false;
+        lcom -> false;
+        ws   -> false;
+        %-----------------------
+        % {_, _}
+        %
+        % {contract, {420, 69}}
+        %-----------------------
+        % kwds ops and punct are all collapsed by
+        % so_scan:scan down to eg {'contract', {420, 69}}
+        % where {420, 69} is the source location
+        % these are three different parsers
+        Sym when Sym =:= kwd;
+                 Sym =:= op;
+                 Sym =:= punct ->
+            % NOTE(review): list_to_atom/1 mints atoms from lexed text;
+            % presumably fine because the kwd/op/punct vocabulary is fixed
+            % by the lexer -- confirm against gsc_tokens
+            Symbol = list_to_atom(SfTokenStr),
+            {true, {Symbol, Pos}};
+        %------------------------------------
+        % {_, _, _}
+        %
+        % {id, {420, 69}, "foo"}
+        %--------------------------------
+        QVar when QVar =:= qid; QVar =:= qcon ->
+            % qualifieds tokenize to
+            % {qid, {420, 69}, ["Foo", "Bar", "baz"]}
+            {true, {QVar, Pos, string:tokens(SfTokenStr, ".")}};
+        SfVar when SfVar =:= id; SfVar =:= con; SfVar =:= tvar ->
+            {true, {SfVar, Pos, SfTokenStr}};
+        % literals
+        % from so_scan:
+        % {CHAR,   token(char,   fun parse_char/1)}
+        % {STRING, token(string, fun parse_string/1)}
+        % {HEX,    token(hex,    fun parse_hex/1)}
+        % {INT,    token(int,    fun parse_int/1)}
+        % {BYTES,  token(bytes,  fun parse_bytes/1)}
+        % so_scan casts strings to binary
+        char    -> {true, {char,   Pos, so_parse_char(SfTokenStr)}};
+        string  -> {true, {string, Pos, so_parse_string(SfTokenStr)}};
+        int16   -> {true, {hex,    Pos, so_parse_hex(SfTokenStr)}};
+        int10   -> {true, {int,    Pos, so_parse_int(SfTokenStr)}};
+        bytes   -> {true, {bytes,  Pos, so_parse_bytes(SfTokenStr)}};
+        NYI ->
+            % ?MODULE keeps the reported module name in sync with the actual
+            % module (the message used to name a module that does not
+            % exist). the message is flattened so that `string` holds a
+            % plain string, like the other #gsc_err{} errors in this module.
+            Msg = lists:flatten(
+                    io_lib:format("~s:to_so_token/1: unhandled token shape: ~p",
+                                  [?MODULE, NYI])),
+            error(#gsc_err{atom   = nyi,
+                           string = Msg})
+    end.
+
+%% ak/ct/sg all tokenize to id
+%% FIXEDME: implement? it seems like so_scan just parses these as
+%% identifiers, so not clear what the advantage is here?
+%%
+%% i suppose we'll find out when we write the syntax parser
+%%
+%% so_scan lexes ak/ct/sg as ids and then parses them as addresses/sigs
+%% in the parsing step
+%API when API =:= ak;
+%         API =:= ct;
+%         API =:= sg ->
+%    {true, {id, Pos, SfTokenStr}};
+
+% copied from so_scan.erl
+% @private
+% parses a character literal (text starting with a single quote) into the
+% single code point it denotes. the escapes are resolved by unescape/3 and
+% the result is NFC-normalized; anything that does not come out as exactly
+% one element raises error(#gsc_err{atom = bad_token}).
+so_parse_char([$' | Chars]) ->
+    Unescaped = unescape($', Chars, []),
+    case unicode:characters_to_nfc_list(Unescaped) of
+        [Char] ->
+            Char;
+        _NotSingleChar ->
+            Reason = "Bad character literal: '" ++ Chars,
+            error(#gsc_err{atom   = bad_token,
+                           string = Reason})
+    end.
+
+% @private
+% parses a string literal (text starting with a double quote): resolves the
+% escapes via unescape/1 and returns the result as an NFC-normalized binary
+so_parse_string([$" | Chars]) ->
+    Unescaped = unescape(Chars),
+    unicode:characters_to_nfc_binary(Unescaped).
+
+% FIXME: simplify this
+%
+% the shape of this code is inherited from so_scan, which operates on lists
+% of bytes rather than on character-lists, so single codepoints had to be
+% converted to multi-byte sequences. We're always working on lists, so this
+% can probably be simplified. It has not been worth fixing so far, but this
+% function has been the source of several annoying bugs, so tread carefully.
+
+% @private
+% unescape/1: resolves the escapes in the tail of a string literal, i.e.
+% the text after the opening double quote, closing quote included
+unescape(Str) ->
+    unescape($", Str, []).
+
+% @private
+% unescape/3: walks the literal's tail until the only thing left is the
+% closing delimiter, resolving escape sequences on the way, and returns the
+% collected characters as a binary. Acc is built in reverse.
+%
+% clause order matters: the \x{...} and \xDD forms have to be tried before
+% the generic one-character escape.
+unescape(Delim, [Delim], Acc) ->
+    unicode:characters_to_binary(lists:reverse(Acc));
+unescape(Delim, [$\\, $x, ${ | Chars], Acc) ->
+    % \x{HEX...}: everything up to the closing brace is the code point
+    NotClosingBrace = fun(C) -> C =/= $} end,
+    {HexDigits, [_ | AfterBrace]} = lists:splitwith(NotClosingBrace, Chars),
+    CodePoint = list_to_integer(HexDigits, 16),
+    unescape(Delim, AfterBrace,
+             [unicode:characters_to_nfc_list([CodePoint]) | Acc]);
+unescape(Delim, [$\\, $x, D1, D2 | Chars], Acc) ->
+    % \xDD: exactly two hex digits
+    CodePoint = list_to_integer([D1, D2], 16),
+    unescape(Delim, Chars,
+             [unicode:characters_to_nfc_list([CodePoint]) | Acc]);
+unescape(Delim, [$\\, Code | Chars], Acc) ->
+    unescape(Delim, Chars, [unescape_code(Delim, Code) | Acc]);
+unescape(Delim, [C | Chars], Acc) ->
+    unescape(Delim, Chars, [C | Acc]).
+
+% @private
+% maps the character after a backslash to the character it stands for.
+% the literal's own delimiter is checked first; an unknown code raises
+% error(#gsc_err{atom = bad_escape_char}).
+unescape_code(Delim, Delim) -> Delim;
+unescape_code(_Delim, $\\)  -> $\\;
+unescape_code(_Delim, $b)   -> $\b;
+unescape_code(_Delim, $e)   -> $\e;
+unescape_code(_Delim, $f)   -> $\f;
+unescape_code(_Delim, $n)   -> $\n;
+unescape_code(_Delim, $r)   -> $\r;
+unescape_code(_Delim, $t)   -> $\t;
+unescape_code(_Delim, $v)   -> $\v;
+unescape_code(_Delim, Code) ->
+    %% TODO (marker carried over from the original error branch)
+    error(#gsc_err{atom   = bad_escape_char,
+                   string = "Bad control sequence: \\" ++ [Code]}).
+
+
+% @private
+% parses a "0x"-prefixed hexadecimal literal into an integer; underscores
+% among the digits are ignored
+so_parse_hex([$0, $x | Digits]) ->
+    Clean = strip_underscores(Digits),
+    list_to_integer(Clean, 16).
+
+% @private
+% parses a decimal literal into an integer; underscores among the digits
+% are ignored
+so_parse_int(Digits) ->
+    Clean = strip_underscores(Digits),
+    list_to_integer(Clean).
+
+% @private
+% parses a "#"-prefixed hex literal into a binary. underscores are ignored
+% and the byte count is the number of hex digits divided by two, rounded
+% up, so an odd digit count is padded with leading zero bits.
+so_parse_bytes([$# | Raw]) ->
+    Hex       = strip_underscores(Raw),
+    Value     = list_to_integer(Hex, 16),
+    ByteCount = (length(Hex) + 1) div 2,
+    <<Value:ByteCount/unit:8>>.
+
+% @private
+% returns S with every underscore removed; the order of the remaining
+% characters is preserved
+strip_underscores(S) ->
+    [C || C <- S, C /= $_].