Files
gsc/src/gso_scan.erl
T
Peter Harpending 4f4adaa284 stopping point
2026-06-02 16:51:05 -07:00

458 lines
14 KiB
Erlang

% @doc compatibility layer to test against so_scan
%
% converts gsc_tokens data to so_scan tokens
%
% Ref: so_scan.erl
-module(gso_scan).
-export_type([
so_kwd/0,
so_special_char/0,
so_symbol/0,
so_token2/0,
so_token3/0,
so_token/0
]).
-export([
scan/1,
ken_barson_rises/2
]).
-include("$gsc_include/gsc.hrl").
%================================
% API: types
%================================
% Sophia keywords, as the symbol atoms that appear in two-element so_scan
% tokens such as {'contract', {420, 69}}.
%
% Every atom is single-quoted, including the ones that need no quoting
% today, so that this type keeps compiling if Erlang reserves more words
% in a future release.
-type so_kwd() :: 'contract'
                | 'include'
                | 'let'
                | 'switch'
                | 'type'
                | 'record'
                | 'datatype'
                | 'if'
                | 'elif'
                | 'else'
                | 'function'
                | 'stateful'
                | 'payable'
                | 'true'
                | 'false'
                | 'mod'
                | 'public'
                | 'entrypoint'
                | 'private'
                | 'indexed'
                | 'namespace'
                | 'interface'
                | 'main'
                | 'using'
                | 'as'
                | 'for'
                | 'hiding'
                | 'band'
                | 'bor'
                | 'bxor'
                | 'bnot'.
% Punctuation symbols, as the atoms that appear in two-element so_scan
% tokens such as {'(', {420, 69}}.
-type so_special_char() :: '..'
                         | ','
                         | '.'
                         | ';'
                         | '('
                         | ')'
                         | '['
                         | ']'
                         | '{'
                         | '}'.
% @doc Deliberately loose type. The matched source text of a keyword,
% operator or punctuation token is cast to an atom, and that atom is what
% ends up here. Because of the trailing atom() the union is effectively
% just atom(); so_kwd() and so_special_char() are listed for documentation.
-type so_symbol() :: so_kwd()
                   | so_special_char()
                   | atom().

% Two-element token: a symbol plus its source location.
-type so_token2() :: {Symbol   :: so_symbol(),
                      Location :: tk_pos()}.

% Tags used as the first element of three-element tokens.
%
% FIXME (carried over from the original): the original note here reads
% "this is 'id', 'con', qid"; its intent is unclear.
-type so_tk3type() :: char
                    | string
                    | hex
                    | int
                    | bytes
                    | qid
                    | qcon
                    | tvar
                    | id
                    | con.

% Three-element token: a tag, its source location, and the token's value.
-type so_token3() :: {TokenType  :: so_tk3type(),
                      Location   :: tk_pos(),
                      TokenValue :: term()}.

% Any token in the so_scan output format.
-type so_token() :: so_token2()
                  | so_token3().
%================================
% API: functions
%================================
-spec scan(SrcStr) -> {ok, SoTokens} | {error, gsc_err()}
    when SrcStr   :: iolist(),
         SoTokens :: [so_token()].
% @doc
% Tokenize source text and return the tokens in the format that so_scan
% outputs. This is meant to agree with so_scan:scan/1 in all cases.
%
% Lexing is done by gsc_tokens:tokens/1; the resulting gsc tokens are then
% converted with to_so_tokens/1.
%
% An unterminated block comment is reported by gsc_tokens as an error that
% carries the tokens lexed before it. Here that case is turned into a
% success containing those converted tokens.
% NOTE(review): presumably so_scan behaves this way too - confirm.
%
% Anything else returned by gsc_tokens:tokens/1 is passed through unchanged.
% @end
scan(SrcStr) ->
    case gsc_tokens:tokens(SrcStr) of
        {error, #gsc_err_bcom_unterminated{prev_tokens = LexedSoFar}} ->
            % awkward, but intentional: report the partial lex as a success
            {ok, to_so_tokens(LexedSoFar)};
        {ok, GscTokens} ->
            {ok, to_so_tokens(GscTokens)};
        Other ->
            Other
    end.
-spec to_so_tokens(GscTokens) -> SoTokens
    when GscTokens :: [tk()],
         SoTokens  :: [so_token()].
% @doc
% Convert a list of gsc tokens into so_scan tokens.
%
% Most gsc tokens map 1-to-1 onto so_scan tokens and are converted by
% to_so_token/1 (which also drops comments and whitespace). The exception
% is ak/ct/sg literals: that mapping is many-to-one, so it has to be
% handled here, at the list level.
%
% The reason is as follows. so_scan lexes ak_ABCD to an id, and only at the
% parsing stage computes the pubkey it corresponds to. So for ak_GHI, where
% I is not a valid base58 char, gsc lexes
%
%     [{ak, "ak_GH"}, {con, "I"}]
%
% while so_scan lexes {id, "ak_GHI"}.
%
% gsc does not discard whitespace tokens, so this situation is detectable:
% it occurs precisely when an ak/ct/sg token is immediately followed by a
% non-whitespace token. There can be more than one such follow-on token and
% they can have a variety of shapes, so they are greedily consumed back
% into a single id by ken_barson_rises/2.
%
% (Bugs in so_scan's happy path are treated as features to reproduce.)
to_so_tokens([]) ->
    [];
to_so_tokens([#tk{shape = Shape, pos = Pos} = ApiTok | Following])
        when Shape =:= ak; Shape =:= ct; Shape =:= sg ->
    % glue any follow-on tokens back onto the ak/ct/sg literal; the merged
    % id keeps the position of the literal it started from
    {#tk{str = MergedStr}, Remaining} = ken_barson_rises(ApiTok, Following),
    [{id, Pos, MergedStr} | to_so_tokens(Remaining)];
to_so_tokens([Tok | Toks]) ->
    % the rest is a hand-written lists:filtermap/2
    case to_so_token(Tok) of
        {true, SoToken} -> [SoToken | to_so_tokens(Toks)];
        false           -> to_so_tokens(Toks)
    end.
-spec ken_barson_rises(InitApiToken, SfToks) -> {FinalApiToken, NewSfToks}
    when InitApiToken  :: tk(),
         SfToks        :: [tk()],
         FinalApiToken :: InitApiToken,
         NewSfToks     :: SfToks.
% @doc
% Re-join an ak/ct/sg token with the follow-on tokens that belong to the
% same so_scan id.
%
% Returns the accumulated token (its str extended with the str of every
% consumed token) together with the tokens that were not consumed.
% @end
%
% Example: ak_GHI is lexed by gsc as ak_GH followed by I. This function
% glues the pieces back together so they become a single so_scan id token
% (which will then fail in the parsing step, as it does with so_scan).
%
% This relies on the property that the concatenation of all token strings
% equals the original source text.
% (FIXME: should test this in the test suite)
%
% How it works: look at the next token. If its shape is listed in
% smash_types/0, so_scan would have consumed it as part of the id, so its
% string is appended to the accumulator and the walk continues. If its
% shape is listed in pass_types/0, or there are no tokens left, the walk
% stops. A shape that is in neither list fails the sanity check (badmatch).
%
% Background, carried over from the original notes and not verifiable from
% this module alone:
%
% so_scan lexes identifiers with /[a-z_][a-zA-Z0-9_']*/. The base58
% alphabet used by smr_apistr58 is:
%
%     123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz
%
% Characters valid in a so_scan id tail but absent from base58:
%
%     | char | why excluded from base58               |
%     |------|----------------------------------------|
%     | 0    | looks like O                           |
%     | I    | looks like 1 or l                      |
%     | O    | looks like 0                           |
%     | l    | looks like 1 or I                      |
%     | _    | not alphanumeric (structural, not b58) |
%     | '    | not alphanumeric (sophia id quirk)     |
%
% When any of these appear AFTER at least one valid base58 char in an
% ak_/ct_/sg_ prefixed identifier, gsc splits what so_scan sees as one id
% token into 2+ gsc tokens.
%
% No split happens if the non-base58 char comes immediately after the
% underscore: smr_plus requires >= 1 base58 char to match, so ak_I, ak_0
% and ak__bar all fall through to id and both tokenizers agree.
ken_barson_rises(Acc, []) ->
    {Acc, []};
ken_barson_rises(#tk{str = AccStr} = Acc,
                 [#tk{shape = NextShape, str = NextStr} | Rest] = Pending) ->
    Smash = lists:member(NextShape, smash_types()),
    Pass = lists:member(NextShape, pass_types()),
    % sanity check: every shape must be classified one way or the other
    true = Smash or Pass,
    case Smash of
        true ->
            % so_scan would have swallowed this token into the id
            ken_barson_rises(Acc#tk{str = AccStr ++ NextStr}, Rest);
        false ->
            % first token that ends the id; leave it (and the rest) alone
            {Acc, Pending}
    end.
% Token shapes that get merged into a preceding ak/ct/sg token by
% ken_barson_rises/2. Each entry shows an input and the follow-on gsc token
% it produces after the ak_GH prefix.
smash_types() ->
    [
        char,   % ak_GH'a'  -> {char,  "'a'"}
        int16,  % ak_GH0xAB -> {int16, "0xAB"}
        int10,  % ak_GH0123 -> {int10, "0123"}
        tvar,   % ak_GH'a   -> {tvar,  "'a"}
        kwd,    % ak_GHlet  -> {kwd,   "let"}
        id,     % ak_GH_AB  -> {id,    "_AB"}
        con     % ak_GHI    -> {con,   "I"}
    ].
% Token shapes that stop ken_barson_rises/2 from merging any further.
%
% For each shape the note says why it cannot be part of a split id: either
% its first character makes so_scan break out of the id, or it can never be
% a disjoined neighbour.
%
% NOTE(review): the ak/ct/sg reasoning covers the token directly after the
% literal. After a smashed token (e.g. ak_GH0ak_AB) an ak/ct/sg token could
% presumably follow - confirm against gsc_tokens.
pass_types() ->
    [
        lcom,    % ak_AB//  breaks out of id
        bcom,    % ak_AB/*  breaks out of id
        ws,      % ak_AB\t  breaks out of id
        punct,   % ak_AB{   breaks out of id
        string,  % ak_AB"   breaks out of id
        bytes,   % ak_AB#   breaks out of id
        ak,      % ak_ABak  [akctsg] are all in the base58 alphabet
        ct,      % (same as ak)
        sg,      % (same as ak)
        qid,     % ak_ABI.Am.A.qid  ??? maybe sophia lexes this to [{id, _}, '.']?
        qcon,    % ak_ABI.Am.A.QCon ??? same
        op       % ak_AB=<  [=!<>+-*/:&|?~@^] break out of id
    ].
-spec to_so_token(GscToken) -> MaybeSoToken
    when GscToken     :: tk(),
         MaybeSoToken :: {true, SoToken}
                       | false,
         SoToken      :: so_token().
% @private
% Convert a single gsc token into a so_scan token, in the {true, Token} |
% false shape used by lists:filtermap/2.
%
% Returns false for tokens that have no so_scan counterpart (comments and
% whitespace).
%
% Does NOT handle ak/ct/sg because these may consume follow-on tokens; they
% are handled by to_so_tokens/1. Passing one here, or any other unknown
% shape, raises error(#gsc_err{atom = nyi}).
% @end
to_so_token(#tk{shape = Shape, pos = Pos, str = Str}) ->
    case Shape of
        %-----------------
        % Ignored
        %-----------------
        bcom -> false;
        lcom -> false;
        ws   -> false;
        %-----------------------
        % {_, _}
        %
        % {contract, {420, 69}}
        %-----------------------
        % kwds, ops and punct come from three different parsers, but
        % so_scan:scan collapses them all down to e.g.
        % {'contract', {420, 69}}, where {420, 69} is the source location.
        %
        % NOTE(review): list_to_atom/1 creates atoms from source text;
        % presumably safe because the lexer only emits a fixed set of
        % kwd/op/punct strings - confirm.
        Sym when Sym =:= kwd; Sym =:= op; Sym =:= punct ->
            {true, {list_to_atom(Str), Pos}};
        %------------------------------------
        % {_, _, _}
        %
        % {id, {420, 69}, "foo"}
        %------------------------------------
        % qualifieds tokenize to
        % {qid, {420, 69}, ["Foo", "Bar", "baz"]}
        Qualified when Qualified =:= qid; Qualified =:= qcon ->
            {true, {Qualified, Pos, string:tokens(Str, ".")}};
        Name when Name =:= id; Name =:= con; Name =:= tvar ->
            {true, {Name, Pos, Str}};
        % literals
        % from so_scan:
        %   {CHAR,   token(char,   fun parse_char/1)}
        %   {STRING, token(string, fun parse_string/1)}
        %   {HEX,    token(hex,    fun parse_hex/1)}
        %   {INT,    token(int,    fun parse_int/1)}
        %   {BYTES,  token(bytes,  fun parse_bytes/1)}
        % so_scan casts strings to binary
        char   -> {true, {char,   Pos, so_parse_char(Str)}};
        string -> {true, {string, Pos, so_parse_string(Str)}};
        int16  -> {true, {hex,    Pos, so_parse_hex(Str)}};
        int10  -> {true, {int,    Pos, so_parse_int(Str)}};
        bytes  -> {true, {bytes,  Pos, so_parse_bytes(Str)}};
        Unhandled ->
            % io_lib:format/2 returns a deep character list; flatten it so
            % that str is a plain string like in the other #gsc_err{}
            % values built in this module
            Msg = lists:flatten(
                    io_lib:format("gso_scan:to_so_token/1: unhandled token shape: ~p",
                                  [Unhandled])),
            error(#gsc_err{atom = nyi,
                           str  = Msg})
    end.
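%% ak/ct/sg all tokenize to id
%%
%% NOTE: to_so_token/1 deliberately has no clause for ak/ct/sg. They are
%% converted to ids in to_so_tokens/1, which may merge follow-on tokens
%% into the id. The commented-out clause below is the plain 1-to-1
%% conversion.
%%
%% FIXEDME: implement? it seems like so_scan just parses these as
%% identifiers, so it is not clear what the advantage is here
%%
%% i suppose we'll find out when we write the syntax parser
%%
%% so_scan lexes ak/ct/sg as ids and then parses them as addresses/sigs
%% in the parsing step
%API when API =:= ak;
% API =:= ct;
% API =:= sg ->
% {true, {id, Pos, SfTokenStr}};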
% The literal parsers from here on were copied from so_scan.erl.

% Parse a character literal into its character.
%
% The argument is the literal's source text starting with the opening
% single quote. The remainder is unescaped with $' as the delimiter and
% NFC-normalised; the result must be a list of exactly one element,
% otherwise a #gsc_err{atom = bad_token} error is raised.
so_parse_char([$' | Chars]) ->
    Unescaped = unescape($', Chars, []),
    case unicode:characters_to_nfc_list(Unescaped) of
        [Char] ->
            Char;
        _Bad ->
            error(#gsc_err{atom = bad_token,
                           str  = "Bad character literal: '" ++ Chars})
    end.
% Parse a string literal into an NFC-normalised binary.
%
% The argument is the literal's source text starting with the opening
% double quote; the remainder is unescaped by unescape/1.
so_parse_string([$" | Body]) ->
    Unescaped = unescape(Body),
    unicode:characters_to_nfc_binary(Unescaped).
% FIXME: clean this up
%
% The shape of this code comes from so_scan, which operates on lists of
% bytes rather than on character lists, so single codepoints have to be
% converted to multi-byte sequences there. Here we are always working on
% lists, so this can probably be simplified. It has not been worth fixing
% so far, but this function has been the source of several annoying bugs.

% Unescape the body of a string literal (delimiter: double quote).
unescape(Str) ->
    unescape($", Str, []).

% unescape(Delim, Chars, Acc)
%
% Walks Chars, which must end with the closing delimiter, collecting the
% unescaped characters in reverse order in Acc. Returns the result of
% unicode:characters_to_binary/1 on the collected characters.
%
% Raises #gsc_err{atom = bad_escape_char} on an unknown backslash escape.
%
% Clause order matters: the closing delimiter is checked first, then the
% two \x forms, then the single-character escapes, then plain characters.
unescape(Delim, [Delim], Acc) ->
    % closing delimiter is the only thing left: done
    unicode:characters_to_binary(lists:reverse(Acc));
unescape(Delim, [$\\, $x, ${ | Chars], Acc) ->
    % \x{HEX...}: hex digits up to the closing brace
    {HexDigits, [_ClosingBrace | Rest]} =
        lists:splitwith(fun(Ch) -> Ch =/= $} end, Chars),
    Codepoint = list_to_integer(HexDigits, 16),
    unescape(Delim, Rest, [unicode:characters_to_nfc_list([Codepoint]) | Acc]);
unescape(Delim, [$\\, $x, Hi, Lo | Chars], Acc) ->
    % \xHH: exactly two hex digits
    Codepoint = list_to_integer([Hi, Lo], 16),
    unescape(Delim, Chars, [unicode:characters_to_nfc_list([Codepoint]) | Acc]);
unescape(Delim, [$\\, Code | Chars], Acc) ->
    % single-character escapes; an escaped delimiter stands for itself
    Char =
        case Code of
            Delim -> Delim;
            $\\   -> $\\;
            $b    -> $\b;
            $e    -> $\e;
            $f    -> $\f;
            $n    -> $\n;
            $r    -> $\r;
            $t    -> $\t;
            $v    -> $\v;
            _     -> error(#gsc_err{atom = bad_escape_char,
                                    str  = "Bad control sequence: \\" ++ [Code]}) %% TODO
        end,
    unescape(Delim, Chars, [Char | Acc]);
unescape(Delim, [Plain | Chars], Acc) ->
    unescape(Delim, Chars, [Plain | Acc]).
% Parse a hex literal ("0x" followed by hex digits, with optional
% underscore separators) into an integer.
so_parse_hex([$0, $x | Digits]) ->
    Cleaned = strip_underscores(Digits),
    list_to_integer(Cleaned, 16).
% Parse a decimal literal (with optional underscore separators) into an
% integer.
so_parse_int(S) ->
    Cleaned = strip_underscores(S),
    list_to_integer(Cleaned).
% Parse a bytes literal ("#" followed by hex digits, with optional
% underscore separators) into a binary.
%
% The binary is sized to hold all hex digits, rounding an odd digit count
% up to a whole byte.
so_parse_bytes([$# | Raw]) ->
    HexDigits = strip_underscores(Raw),
    Value = list_to_integer(HexDigits, 16),
    ByteCount = (length(HexDigits) + 1) div 2,
    <<Value:ByteCount/unit:8>>.
% Remove every underscore from a string.
strip_underscores(S) ->
    [C || C <- S, C /= $_].