% @doc Compatibility layer used to test gsc's tokenizer against so_scan.
%
% Converts gsc_tokens data to so_scan tokens.
%
% Ref: so_scan.erl
|
|
-module(gso_scan).

% Types describing the so_scan token format produced by this module.
-export_type([so_kwd/0, so_special_char/0, so_symbol/0,
              so_token2/0, so_token3/0, so_token/0]).

% scan/1 is the entry point; ken_barson_rises/2 is the ak/ct/sg re-joining
% step used by to_so_tokens/1.
-export([scan/1, ken_barson_rises/2]).

% NOTE(review): presumably provides the #tk{}, #gsc_err{} and
% #gsc_err_bcom_unterminated{} records and the tk(), tk_pos() and gsc_err()
% types used below -- confirm against gsc.hrl.
-include("$gsc_include/gsc.hrl").
|
|
|
|
%================================
|
|
% API: types
|
|
%================================
|
|
|
|
% Keyword symbols. Every atom is single-quoted on purpose: a quoted atom stays
% valid even if a later Erlang release turns the bare word into a reserved
% word, so this list never needs revisiting for that reason.
-type so_kwd() :: 'contract' | 'include' | 'let' | 'switch' | 'type'
                | 'record' | 'datatype' | 'if' | 'elif' | 'else'
                | 'function' | 'stateful' | 'payable' | 'true' | 'false'
                | 'mod' | 'public' | 'entrypoint' | 'private' | 'indexed'
                | 'namespace' | 'interface' | 'main' | 'using' | 'as'
                | 'for' | 'hiding' | 'band' | 'bor' | 'bxor' | 'bnot'.
|
|
|
|
% Punctuation symbols, each represented by the atom of its own text.
-type so_special_char() :: '..' | ',' | '.' | ';'
                         | '(' | ')'
                         | '[' | ']'
                         | '{' | '}'.
|
|
|
|
% @doc Symbol carried by a two-element token.
%
% This is a weak type: the symbol is the text of a regex match cast to an
% atom, so in practice it can be any atom. so_kwd() and so_special_char() are
% listed only to document the expected values; atom() already covers them.
-type so_symbol() :: so_kwd()
                   | so_special_char()
                   | atom().
|
|
|
|
% Two-element token: a symbol plus its source location,
% e.g. {contract, {420, 69}}.
-type so_token2() :: {
        Symbol   :: so_symbol(),
        Location :: tk_pos()
       }.
|
|
|
|
% Tags used by three-element tokens. to_so_token/1 produces them as follows:
%   id, con, tvar  - from the gsc shapes of the same name
%   qid, qcon      - qualified names, value is the list of name parts
%   char, string   - parsed literals
%   hex, int       - from gsc shapes int16 and int10 respectively
%   bytes          - parsed bytes literal
% FIXME: the original note here singled out 'id', 'con' and qid without
% saying what needs fixing -- revisit.
-type so_tk3type() :: id | con | tvar
                    | qid | qcon
                    | char | string
                    | hex | int
                    | bytes.
|
|
|
|
% Three-element token: a tag, its source location and a value whose shape
% depends on the tag, e.g. {id, {420, 69}, "foo"}.
-type so_token3() :: {
        TokenType  :: so_tk3type(),
        Location   :: tk_pos(),
        TokenValue :: term()
       }.
|
|
|
|
% Any token in so_scan's output format.
-type so_token() :: so_token2()
                  | so_token3().
|
|
|
|
|
|
%================================
|
|
% API: functions
|
|
%================================
|
|
|
|
-spec scan(SrcStr) -> {ok, SoTokens} | {error, gsc_err()}
    when SrcStr   :: iolist(),
         SoTokens :: [so_token()].
% @doc
% Tokenize `SrcStr' with gsc_tokens:tokens/1 and return the result in the
% token format that so_scan outputs. This is meant to agree with
% so_scan:scan/1 in all cases.
%
% An "unterminated block comment" error from gsc is NOT reported as an error:
% the tokens carried in the error's `prev_tokens' field are converted and
% returned as `{ok, _}' instead (presumably because that is how so_scan
% behaves -- confirm against so_scan.erl). Any other result from gsc_tokens
% is passed through unchanged.
% @end

scan(SrcStr) ->
    so_result(gsc_tokens:tokens(SrcStr)).

% Translate the result of gsc_tokens:tokens/1 into scan/1's return value.
so_result({ok, GscTokens}) ->
    {ok, to_so_tokens(GscTokens)};
so_result({error, #gsc_err_bcom_unterminated{prev_tokens = GscTokens}}) ->
    {ok, to_so_tokens(GscTokens)};
so_result(Other) ->
    Other.
|
|
|
|
|
|
|
|
-spec to_so_tokens(GscTokens) -> SoTokens
    when GscTokens :: [tk()],
         SoTokens  :: [so_token()].

% @doc
% Convert a list of gsc tokens into so_scan tokens.
%
% Most gsc tokens map 1-to-1 onto so_scan tokens (or are dropped) and are
% handled by to_so_token/1. The exception is ak/ct/sg literals, which are a
% many-to-one mapping and therefore have to be handled at the list level.
%
% The reason: so_scan lexes `ak_ABCD' as an id and only computes the
% corresponding pubkey at the parsing stage. Given `ak_GHI', where `I' is not
% a valid base58 character, gsc lexes
%
%     [{ak, "ak_GH"}, {con, "I"}]
%
% whereas so_scan lexes `{id, "ak_GHI"}'.
%
% gsc does not discard whitespace, so this situation is detectable: it
% occurs precisely when an ak/sg/ct token is immediately followed by a
% non-whitespace token. There can be more than one such follower and they
% come in several shapes, so they are greedily consumed back into a single
% id by ken_barson_rises/2. The resulting id keeps the position of the
% original ak/ct/sg token.
% @end
to_so_tokens(GscTokens) ->
    to_so_tokens(GscTokens, []).

% Worker: Acc holds the so_scan tokens produced so far, newest first.
to_so_tokens([], Acc) ->
    lists:reverse(Acc);
to_so_tokens([#tk{shape = Shape, pos = Pos} = ApiTok | Followers], Acc)
  when Shape =:= ak; Shape =:= ct; Shape =:= sg ->
    % re-join the literal with whatever so_scan would have lexed as part of
    % the same id
    {#tk{str = JoinedStr}, Remaining} = ken_barson_rises(ApiTok, Followers),
    to_so_tokens(Remaining, [{id, Pos, JoinedStr} | Acc]);
to_so_tokens([GscTok | Remaining], Acc) ->
    % everything else is a plain filtermap over to_so_token/1
    case to_so_token(GscTok) of
        {true, SoToken} -> to_so_tokens(Remaining, [SoToken | Acc]);
        false           -> to_so_tokens(Remaining, Acc)
    end.
|
|
|
|
|
|
|
|
-spec ken_barson_rises(InitApiToken, SfToks) -> {FinalApiToken, NewSfToks}
    when InitApiToken  :: tk(),
         SfToks        :: [tk()],
         FinalApiToken :: InitApiToken,
         NewSfToks     :: SfToks.
% @doc
% Re-join an ak/ct/sg token with the tokens that so_scan would have lexed as
% part of the same identifier.
%
% `InitApiToken' is the ak/ct/sg token (or the accumulated result so far) and
% `SfToks' are the tokens that follow it. Followers are consumed greedily:
% while the next token's shape is listed in smash_types/0 its string is
% appended to the accumulated token's string. Consumption stops at the first
% token whose shape is listed in pass_types/0, or at the end of the list.
% Returns the accumulated token and the tokens that were not consumed.
%
% A follower whose shape is in neither list fails the sanity check below
% with a `badmatch' error.
% @end

% Example: `ak_GHI' is lexed by gsc as `ak_GH' followed by `I'; this function
% joins them back into a single token, which so_scan sees as one id (and
% which will then fail in so_scan's parsing step).
%
% Joining by string concatenation relies on the property that the
% concatenation of all token strings equals the original source file
% (FIXME: should test this in the test suite).
%
% Background (machine-generated analysis, kept for reference):
%
% so_scan lexes identifiers with /[a-z_][a-zA-Z0-9_']*/. The base58 alphabet
% used by `smr_apistr58' is:
%
%   123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz
%
% Characters valid in a so_scan id tail but absent from base58:
%
%   | char | why excluded from base58              |
%   |------|---------------------------------------|
%   | `0`  | looks like `O`                        |
%   | `I`  | looks like `1` or `l`                 |
%   | `O`  | looks like `0`                        |
%   | `l`  | looks like `1` or `I`                 |
%   | `_`  | not alphanumeric (structural, not b58)|
%   | `'`  | not alphanumeric (sophia id quirk)    |
%
% When any of these appear AFTER at least one valid base58 char in an
% `ak_'/`ct_'/`sg_' prefixed identifier, gsc splits what so_scan sees as one
% `id' token into 2+ gsc tokens.
%
% No split happens if the non-base58 char comes immediately after `_':
% `smr_plus' requires >=1 base58 char to match, so `ak_I', `ak_0' and
% `ak__bar' all fall through to `id' and both tokenizers agree.
ken_barson_rises(Done, []) ->
    % nothing left to consider
    {Done, []};
ken_barson_rises(#tk{str = JoinedSoFar} = AccTok,
                 [#tk{shape = NextShape, str = NextStr} | Tail] = Pending) ->
    Absorb = lists:member(NextShape, smash_types()),
    Stop   = lists:member(NextShape, pass_types()),
    % sanity check: every shape must be classified as one or the other
    true = Absorb orelse Stop,
    case Absorb of
        true ->
            % so_scan would have consumed this token as part of the id
            Grown = AccTok#tk{str = JoinedSoFar ++ NextStr},
            ken_barson_rises(Grown, Tail);
        false ->
            % this token ends the id; hand it back untouched
            {AccTok, Pending}
    end.
|
|
|
|
% Token shapes that get merged into a preceding ak/ct/sg token by
% ken_barson_rises/2, with an example of how each can arise:
%
%   char    ak_GH'a'   -> {char,  "'a'"}
%   int16   ak_GH0xAB  -> {int16, "0xAB"}
%   int10   ak_GH0123  -> {int10, "0123"}
%   tvar    ak_GH'a    -> {tvar,  "'a"}
%   kwd     ak_GHlet   -> {kwd,   "let"}
%   id      ak_GH_AB   -> {id,    "_AB"}
%   con     ak_GHI     -> {con,   "I"}
smash_types() ->
    Literals = [char, int16, int10],
    Names    = [tvar, kwd, id, con],
    Literals ++ Names.
|
|
|
|
% Token shapes that end the greedy merge in ken_barson_rises/2. Each of these
% either starts with a character that makes so_scan break out of consuming an
% id, or can never be a disjoined neighbor of an ak/ct/sg token:
%
%   lcom      ak_AB//   breaks out of id
%   bcom      ak_AB/*   breaks out of id
%   ws        ak_AB\t   breaks out of id
%   punct     ak_AB{    breaks out of id
%   string    ak_AB"    breaks out of id
%   bytes     ak_AB#    breaks out of id
%   ak,ct,sg  ak_ABak   the characters [akctsg] are all in the base58 alphabet
%   qid       ak_ABI.Am.A.qid   ??? maybe sophia lexes this to [{id, _}, '.']?
%   qcon      ak_ABI.Am.A.QCon  ??? same
%   op        ak_AB=<   [=!<>+-*/:&|?~@^] break out of id
pass_types() ->
    Skipped    = [lcom, bcom, ws],
    Delimiters = [punct, string, bytes],
    ApiLits    = [ak, ct, sg],
    Qualified  = [qid, qcon],
    Skipped ++ Delimiters ++ ApiLits ++ Qualified ++ [op].
|
|
|
|
|
|
|
|
-spec to_so_token(GscToken) -> MaybeSoToken
    when GscToken     :: tk(),
         MaybeSoToken :: {true, SoToken}
                       | false,
         SoToken      :: so_token().

% @private
% Convert a single gsc token into its so_scan equivalent, in the
% lists:filtermap/2 convention: `false' means the token has no so_scan
% counterpart and is dropped, `{true, SoToken}' carries the converted token.
%
% Does NOT handle ak/ct/sg because these may consume follow-on tokens; they
% are dealt with at the list level in to_so_tokens/1. Any shape without a
% clause below (ak/ct/sg included) raises `#gsc_err{atom = nyi}'.
% @end

to_so_token(#tk{shape = Shape,
                pos   = Pos,
                str   = Str}) ->
    case Shape of
        %-----------------
        % Ignored
        %-----------------
        bcom -> false;
        lcom -> false;
        ws   -> false;
        %-----------------------
        % {_, _}
        %
        % {contract, {420, 69}}
        %-----------------------
        % kwds, ops and punct are three different parsers in so_scan, but
        % so_scan:scan collapses all of them down to e.g.
        % {'contract', {420, 69}} where {420, 69} is the source location
        Sym when Sym =:= kwd;
                 Sym =:= op;
                 Sym =:= punct ->
            Symbol = list_to_atom(Str),
            {true, {Symbol, Pos}};
        %------------------------------------
        % {_, _, _}
        %
        % {id, {420, 69}, "foo"}
        %------------------------------------
        QVar when QVar =:= qid; QVar =:= qcon ->
            % qualifieds tokenize to
            % {qid, {420, 69}, ["Foo", "Bar", "baz"]}
            {true, {QVar, Pos, string:tokens(Str, ".")}};
        Var when Var =:= id; Var =:= con; Var =:= tvar ->
            {true, {Var, Pos, Str}};
        % literals
        % from so_scan:
        %   {CHAR,   token(char,   fun parse_char/1)}
        %   {STRING, token(string, fun parse_string/1)}
        %   {HEX,    token(hex,    fun parse_hex/1)}
        %   {INT,    token(int,    fun parse_int/1)}
        %   {BYTES,  token(bytes,  fun parse_bytes/1)}
        % so_scan casts strings to binary
        char   -> {true, {char,   Pos, so_parse_char(Str)}};
        string -> {true, {string, Pos, so_parse_string(Str)}};
        int16  -> {true, {hex,    Pos, so_parse_hex(Str)}};
        int10  -> {true, {int,    Pos, so_parse_int(Str)}};
        bytes  -> {true, {bytes,  Pos, so_parse_bytes(Str)}};
        Unhandled ->
            % The message names this module (gso_scan) so the failure can be
            % traced to the right place. It is flattened so that `str' is a
            % plain string like the other #gsc_err{} values built in this
            % file.
            Msg = lists:flatten(
                    io_lib:format("gso_scan:to_so_token/1: unhandled token shape: ~p",
                                  [Unhandled])),
            error(#gsc_err{atom = nyi,
                           str  = Msg})
    end.
|
|
|
|
%% Why to_so_token/1 has no ak/ct/sg clause:
%%
%% so_scan lexes ak/ct/sg literals as plain identifiers and only parses them
%% as addresses/signatures in the parsing step, so they all tokenize to `id`.
%% A per-token clause such as the one below was considered, but it is not
%% sufficient: an ak/ct/sg token may have to consume follow-on tokens, so
%% these shapes are handled at the list level in to_so_tokens/1 instead.
%% Whether gsc gains anything from lexing them separately should become
%% clear once the syntax parser is written.
%%
%API when API =:= ak;
%         API =:= ct;
%         API =:= sg ->
%    {true, {id, Pos, SfTokenStr}};
|
|
|
|
% Copied from so_scan.erl.
%
% Parse a character literal (including its surrounding single quotes) into a
% single codepoint. The text after the opening quote is unescaped up to the
% closing quote and NFC-normalized; if the result is not exactly one
% codepoint a `#gsc_err{atom = bad_token}' error is raised.
so_parse_char([$' | Body]) ->
    Unescaped = unescape($', Body, []),
    case unicode:characters_to_nfc_list(Unescaped) of
        [Codepoint] ->
            Codepoint;
        _NotExactlyOne ->
            error(#gsc_err{atom = bad_token,
                           str  = "Bad character literal: '" ++ Body})
    end.
|
|
|
|
% Parse a string literal (including its surrounding double quotes) into an
% NFC-normalized binary, resolving escape sequences via unescape/1.
so_parse_string([$" | Body]) ->
    Unescaped = unescape(Body),
    unicode:characters_to_nfc_binary(Unescaped).
|
|
|
|
% FIXME: simplify the unescape functions.
%
% The current shape exists because so_scan operates on lists of bytes rather
% than on character lists, so single codepoints have to be converted to
% multi-byte sequences. This module always works on lists, so it can
% probably be simplified. It has not been worth fixing so far, but be
% careful when touching it: this code has been the source of several
% annoying bugs.
%
% unescape/1 handles the body of a double-quoted string literal: `Body' is
% the text after the opening quote, up to and including the closing quote.
unescape(Body) ->
    unescape($", Body, []).
|
|
|
|
% Resolve escape sequences in the body of a quoted literal.
%
%   Delim - the quote character that terminates the literal
%   Input - remaining text; must end with Delim
%   Acc   - decoded output so far, newest first (chars or nested char lists)
%
% Returns the decoded text as produced by unicode:characters_to_binary/1.
% Clause order matters: the terminator is checked first, then `\x{...}',
% then `\xHH', then single-character escapes, then plain characters.
unescape(Delim, [Delim], Acc) ->
    % only the closing delimiter is left: done
    unicode:characters_to_binary(lists:reverse(Acc));
unescape(Delim, [$\\, $x, ${ | Tail], Acc) ->
    % \x{H...}: hex codepoint of any length, terminated by `}'
    {HexDigits, [_ClosingBrace | AfterBrace]} =
        lists:splitwith(fun(Ch) -> Ch =/= $} end, Tail),
    Codepoint = list_to_integer(HexDigits, 16),
    Decoded = unicode:characters_to_nfc_list([Codepoint]),
    unescape(Delim, AfterBrace, [Decoded | Acc]);
unescape(Delim, [$\\, $x, Hi, Lo | Tail], Acc) ->
    % \xHH: exactly two hex digits
    Codepoint = list_to_integer([Hi, Lo], 16),
    Decoded = unicode:characters_to_nfc_list([Codepoint]),
    unescape(Delim, Tail, [Decoded | Acc]);
unescape(Delim, [$\\, Code | Tail], Acc) ->
    % single-character escapes; an escaped delimiter stands for itself
    Char =
        case Code of
            Delim -> Delim;
            $\\   -> $\\;
            $b    -> $\b;
            $e    -> $\e;
            $f    -> $\f;
            $n    -> $\n;
            $r    -> $\r;
            $t    -> $\t;
            $v    -> $\v;
            _     ->
                % TODO (carried over from the original; no detail given)
                error(#gsc_err{atom = bad_escape_char,
                               str  = "Bad control sequence: \\" ++ [Code]})
        end,
    unescape(Delim, Tail, [Char | Acc]);
unescape(Delim, [Plain | Tail], Acc) ->
    % ordinary character: copy through
    unescape(Delim, Tail, [Plain | Acc]).
|
|
|
|
|
|
% Parse a hex integer literal such as "0xAB_CD": drop the "0x" prefix and any
% `_' digit separators, then read the rest as base 16.
so_parse_hex("0x" ++ Digits) ->
    Clean = strip_underscores(Digits),
    list_to_integer(Clean, 16).
|
|
|
|
% Parse a decimal integer literal such as "1_000", ignoring `_' digit
% separators.
so_parse_int(S) ->
    Clean = strip_underscores(S),
    list_to_integer(Clean).
|
|
|
|
% Parse a bytes literal such as "#ab_cd" into a binary. The `#' prefix and
% any `_' separators are dropped and the remaining hex digits are read as
% one big-endian integer. The binary is (digit count + 1) div 2 bytes wide,
% so an odd number of digits gets a leading zero nibble.
so_parse_bytes("#" ++ Raw) ->
    HexDigits = strip_underscores(Raw),
    Value     = list_to_integer(HexDigits, 16),
    ByteCount = (length(HexDigits) + 1) div 2,
    <<Value:ByteCount/integer-unit:8>>.
|
|
|
|
% Remove every `_' from a string; used to drop digit separators from numeric
% literals before handing them to list_to_integer.
strip_underscores(S) ->
    [C || C <- S, C /= $_].
|