wip name cleanups
This commit is contained in:
@@ -0,0 +1,457 @@
|
||||
% @doc compatibility layer to test against so_scan
|
||||
%
|
||||
% converts gsc_tokens data to so_scan tokens
|
||||
%
|
||||
% Ref: so_scan.erl
|
||||
-module(gso_scan).
|
||||
|
||||
-export_type([
|
||||
so_kwd/0,
|
||||
so_special_char/0,
|
||||
so_symbol/0,
|
||||
so_token2/0,
|
||||
so_token3/0,
|
||||
so_token/0
|
||||
]).
|
||||
|
||||
-export([
|
||||
scan/1,
|
||||
ken_barson_rises/2
|
||||
]).
|
||||
|
||||
-include("$gsc_include/gsc.hrl").
|
||||
|
||||
%================================
|
||||
% API: types
|
||||
%================================
|
||||
|
||||
% Keyword atoms that can appear as the symbol of a two-element so_scan
% token, e.g. {'contract', Pos}.
%
% Every atom is single-quoted, including those that need no quoting
% today, so that this declaration keeps compiling if a future Erlang
% release reserves one of these words. Quoting does not change the atom.
-type so_kwd() :: 'contract'
                | 'include'
                | 'let'
                | 'switch'
                | 'type'
                | 'record'
                | 'datatype'
                | 'if'
                | 'elif'
                | 'else'
                | 'function'
                | 'stateful'
                | 'payable'
                | 'true'
                | 'false'
                | 'mod'
                | 'public'
                | 'entrypoint'
                | 'private'
                | 'indexed'
                | 'namespace'
                | 'interface'
                | 'main'
                | 'using'
                | 'as'
                | 'for'
                | 'hiding'
                | 'band'
                | 'bor'
                | 'bxor'
                | 'bnot'.
|
||||
|
||||
% Punctuation atoms that can appear as the symbol of a two-element token.
-type so_special_char() ::
        '..' | ',' | '.' | ';'
      | '(' | ')'
      | '[' | ']'
      | '{' | '}'.
|
||||
|
||||
% Symbol carried by a two-element token.
%
% This is a deliberately loose type: the symbol is the matched source text
% cast to an atom (see to_so_token/1), so besides keywords and special
% characters any atom() is admitted. The first two alternatives are kept
% for documentation only; atom() already subsumes them.
-type so_symbol() :: so_kwd()
                   | so_special_char()
                   | atom().
|
||||
|
||||
% Two-element token: a bare symbol plus its source location.
-type so_token2() :: {Symbol   :: so_symbol(),
                      Location :: gsc_pos()}.

% Tags used by three-element tokens.
%
% FIXME (carried over from the original note, which only says
% "this is 'id', 'con', qid"): the intended follow-up is unclear.
-type so_tk3type() :: char
                    | string
                    | hex
                    | int
                    | bytes
                    | qid
                    | qcon
                    | tvar
                    | id
                    | con.

% Three-element token: a tag, the source location, and a payload whose
% shape depends on the tag (see to_so_token/1).
-type so_token3() :: {TokenType  :: so_tk3type(),
                      Location   :: gsc_pos(),
                      TokenValue :: term()}.

% Any token in so_scan's output format.
-type so_token() :: so_token2()
                  | so_token3().
|
||||
|
||||
|
||||
%================================
|
||||
% API: functions
|
||||
%================================
|
||||
|
||||
-spec scan(SrcStr) -> {ok, SoTokens} | {error, gsc_err()}
    when SrcStr   :: iolist(),
         SoTokens :: [so_token()].
% @doc
% Tokenize `SrcStr' with gsc_tokens:tokens/1 and return the result in the
% token format that so_scan outputs.
%
% This is meant to agree with so_scan:scan/1 in all cases.
% @end

scan(SrcStr) ->
    scan_result_to_so(gsc_tokens:tokens(SrcStr)).

% Translate the outcome of gsc_tokens:tokens/1 into scan/1's return value.
scan_result_to_so({ok, GscTokens}) ->
    {ok, to_so_tokens(GscTokens)};
scan_result_to_so({error, #gsc_err_bcom_unterminated{prev_tokens = GscTokens}}) ->
    % An unterminated block comment is reported as success, using the
    % tokens collected before the comment. Presumably this mirrors so_scan,
    % which this module must agree with -- confirm against so_scan.erl.
    {ok, to_so_tokens(GscTokens)};
scan_result_to_so(OtherResult) ->
    % Anything else is handed back to the caller unchanged.
    OtherResult.
|
||||
|
||||
|
||||
|
||||
-spec to_so_tokens(SfcTokens) -> SoTokens
    when SfcTokens :: [tk()],
         SoTokens  :: [so_token()].

% @doc
% Convert a list of gsc tokens into so_scan tokens.
%
% Most gsc tokens map one-to-one onto so_scan tokens (or onto nothing, for
% comments and whitespace). The exception is ak/ct/sg literals, which are
% a many-to-one mapping and therefore have to be handled at list level:
%
% so_scan lexes ak_ABCD as an id and only computes the corresponding
% pubkey at the parsing stage. In ak_GHI the `I' is not a valid base58
% character, so gsc lexes
%
%     [{ak, "ak_GH"}, {con, "I"}]
%
% whereas so_scan lexes {id, "ak_GHI"}.
%
% Because gsc keeps whitespace tokens, the split is detectable: it occurs
% exactly when an ak/ct/sg token is immediately followed by a
% non-whitespace token. There may be several such followers, of different
% types, so they are greedily merged back into a single id by
% ken_barson_rises/2. The merged id keeps the position of the ak/ct/sg
% token.
% @end
to_so_tokens(SfcTokens) ->
    to_so_tokens_acc(SfcTokens, []).

% Accumulator loop behind to_so_tokens/1; Acc holds the output reversed.
to_so_tokens_acc([], Acc) ->
    lists:reverse(Acc);
to_so_tokens_acc([ApiTok = #tk{type = Type, pos = Pos} | Following], Acc)
  when Type =:= ak; Type =:= ct; Type =:= sg ->
    {#tk{string = JoinedStr}, Remaining} = ken_barson_rises(ApiTok, Following),
    to_so_tokens_acc(Remaining, [{id, Pos, JoinedStr} | Acc]);
to_so_tokens_acc([Tok | Following], Acc) ->
    % Everything else is a plain filtermap over to_so_token/1.
    case to_so_token(Tok) of
        {true, SoToken} -> to_so_tokens_acc(Following, [SoToken | Acc]);
        false           -> to_so_tokens_acc(Following, Acc)
    end.
|
||||
|
||||
|
||||
|
||||
-spec ken_barson_rises(InitApiToken, SfToks) -> {FinalApiToken, NewSfToks}
    when InitApiToken  :: tk(),
         SfToks        :: [tk()],
         FinalApiToken :: InitApiToken,
         NewSfToks     :: SfToks.
% @doc
% Re-join an ak/ct/sg token with the tokens that so_scan would have lexed
% as part of the same `id'.
%
% Example: gsc lexes `ak_GHI' as `ak_GH' followed by `I'; this function
% merges them back into one token whose string is "ak_GHI". so_scan emits
% that as a single id, which then fails in its parsing step.
%
% The merge is plain string concatenation. It relies on the property that
% the concatenation of all gsc token strings equals the original source
% (FIXME: should be covered by the test suite).
%
% The function looks at the next token: if its type is listed in
% smash_types/0 its string is appended and the scan continues; if its
% type is listed in pass_types/0 the accumulated token and the untouched
% remainder are returned. A type in neither list fails the sanity match.
%
% Background, per the original (machine-generated, unverified) note:
% so_scan lexes identifiers with /[a-z_][a-zA-Z0-9_']*/, and the base58
% alphabet used by `smr_apistr58' is
%
%     123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz
%
% Characters valid in a so_scan id tail but absent from base58:
%
%     | char | why excluded from base58              |
%     |------|---------------------------------------|
%     | `0`  | looks like `O`                        |
%     | `I`  | looks like `1` or `l`                 |
%     | `O`  | looks like `0`                        |
%     | `l`  | looks like `1` or `I`                 |
%     | `_`  | not alphanumeric (structural, not b58)|
%     | `'`  | not alphanumeric (sophia id quirk)    |
%
% When one of these appears AFTER at least one valid base58 character in
% an `ak_'/`ct_'/`sg_' prefixed identifier, gsc splits what so_scan sees
% as one id into two or more gsc tokens.
%
% No split happens if the non-base58 character comes immediately after
% the `_': `smr_plus' requires >=1 base58 character to match, so `ak_I',
% `ak_0' and `ak__bar' all fall through to `id' and both tokenizers agree.
% @end
ken_barson_rises(Acc = #tk{string = JoinedSoFar},
                 Tokens = [#tk{type = NextType, string = NextStr} | Tail]) ->
    IsSmash = lists:member(NextType, smash_types()),
    IsPass  = lists:member(NextType, pass_types()),
    % sanity check: every token type must be classified in one of the lists
    true = IsSmash orelse IsPass,
    case IsSmash of
        true ->
            % so_scan would have swallowed this token into the id: append
            % its text and keep looking at what follows
            ken_barson_rises(Acc#tk{string = JoinedSoFar ++ NextStr}, Tail);
        false ->
            % the next token ends the id; leave it in the stream
            {Acc, Tokens}
    end;
ken_barson_rises(Acc, []) ->
    % ran out of input
    {Acc, []}.
|
||||
|
||||
% Token types that ken_barson_rises/2 merges into a preceding ak/ct/sg
% token. Each entry shows a source snippet and the gsc token that follows
% the `ak_GH' part.
smash_types() ->
    [
     char,   % ak_GH'a'   ->  {char,  "'a'"}
     int16,  % ak_GH0xAB  ->  {int16, "0xAB"}
     int10,  % ak_GH0123  ->  {int10, "0123"}
     tvar,   % ak_GH'a    ->  {tvar,  "'a"}
     kwd,    % ak_GHlet   ->  {kwd,   "let"}
     id,     % ak_GH_AB   ->  {id,    "_AB"}
     con     % ak_GHI     ->  {con,   "I"}
    ].
|
||||
|
||||
% Token types that end the merge in ken_barson_rises/2.
%
% For each of these, either its first character makes so_scan stop
% consuming an id, or it can never be a split-off neighbour of an
% ak/ct/sg token.
pass_types() ->
    [
     lcom,    % ak_AB//   breaks out of id
     bcom,    % ak_AB/*   breaks out of id
     ws,      % ak_AB\t   breaks out of id
     punct,   % ak_AB{    breaks out of id
     string,  % ak_AB"    breaks out of id
     bytes,   % ak_AB#    breaks out of id
     ak,      % ak_ABak   the letters [akctsg] are all in the base58
     ct,      %           alphabet, so these never start a new token
     sg,      %           right after an ak/ct/sg token
     qid,     % ak_ABI.Am.A.qid   ??? maybe sophia lexes this to [{id, _}, '.']?
     qcon,    % ak_ABI.Am.A.QCon  ??? same
     op       % ak_AB=<   [=!<>+-*/:&|?~@^] break out of id
    ].
|
||||
|
||||
|
||||
|
||||
-spec to_so_token(SfcToken) -> MaybeSoToken
    when SfcToken     :: tk(),
         MaybeSoToken :: {true, SoToken}
                       | false,
         SoToken      :: so_token().

% @private
% Convert one gsc token into its so_scan equivalent, in the
% lists:filtermap/2 convention: `false' drops the token, `{true, T}'
% keeps T.
%
% Does NOT handle ak/ct/sg because these may consume follow-on tokens;
% see to_so_tokens/1. Any other unknown type raises a #gsc_err{} with
% atom `nyi'.
% @end

to_so_token(#tk{type = Type, pos = Pos, string = Str}) ->
    to_so_token_by_type(Type, Pos, Str).

%-----------------
% Ignored
%-----------------
to_so_token_by_type(Skipped, _Pos, _Str)
  when Skipped =:= bcom; Skipped =:= lcom; Skipped =:= ws ->
    false;
%-----------------------
% {_, _}
%
% {contract, {420, 69}}
%-----------------------
% kwds, ops and punct are three different parsers in gsc, but so_scan
% collapses all of them to e.g. {'contract', {420, 69}} where {420, 69}
% is the source location.
to_so_token_by_type(Sym, Pos, Str)
  when Sym =:= kwd; Sym =:= op; Sym =:= punct ->
    % NOTE(review): list_to_atom/1 is acceptable only if the lexer limits
    % these three token types to a fixed vocabulary -- confirm in
    % gsc_tokens.
    {true, {list_to_atom(Str), Pos}};
%------------------------------------
% {_, _, _}
%
% {id, {420, 69}, "foo"}
%------------------------------------
to_so_token_by_type(Qualified, Pos, Str)
  when Qualified =:= qid; Qualified =:= qcon ->
    % qualifieds tokenize to {qid, {420, 69}, ["Foo", "Bar", "baz"]}
    {true, {Qualified, Pos, string:tokens(Str, ".")}};
to_so_token_by_type(Name, Pos, Str)
  when Name =:= id; Name =:= con; Name =:= tvar ->
    {true, {Name, Pos, Str}};
% literals
% from so_scan:
%   {CHAR,   token(char,   fun parse_char/1)}
%   {STRING, token(string, fun parse_string/1)}
%   {HEX,    token(hex,    fun parse_hex/1)}
%   {INT,    token(int,    fun parse_int/1)}
%   {BYTES,  token(bytes,  fun parse_bytes/1)}
% so_scan casts strings to binary
to_so_token_by_type(char, Pos, Str) ->
    {true, {char, Pos, so_parse_char(Str)}};
to_so_token_by_type(string, Pos, Str) ->
    {true, {string, Pos, so_parse_string(Str)}};
to_so_token_by_type(int16, Pos, Str) ->
    {true, {hex, Pos, so_parse_hex(Str)}};
to_so_token_by_type(int10, Pos, Str) ->
    {true, {int, Pos, so_parse_int(Str)}};
to_so_token_by_type(bytes, Pos, Str) ->
    {true, {bytes, Pos, so_parse_bytes(Str)}};
to_so_token_by_type(Unhandled, _Pos, _Str) ->
    % The message names this module (gso_scan) and is flattened so that
    % the error carries a plain string, like the other #gsc_err{} values
    % built in this file.
    Msg = lists:flatten(
            io_lib:format("gso_scan:to_so_token/1: unhandled token shape: ~p",
                          [Unhandled])),
    error(#gsc_err{atom   = nyi,
                   string = Msg}).
|
||||
|
||||
%% ak/ct/sg all tokenize to id
|
||||
%% FIXEDME: implement? it seems like so_scan just parses these as
|
||||
%% identifiers, so not clear what the advantage is here?
|
||||
%%
|
||||
%% i suppose we'll find out when we write the syntax parser
|
||||
%%
|
||||
%% so_scan lexes ak/ct/sg as ids and then parses them as addresses/sigs
|
||||
%% in the parsing step
|
||||
%API when API =:= ak;
|
||||
% API =:= ct;
|
||||
% API =:= sg ->
|
||||
% {true, {id, Pos, SfTokenStr}};
|
||||
|
||||
% copied from so_scan.erl
%
% Decode a character literal token. The argument starts with the opening
% single quote. The result is the single codepoint left after unescaping
% and NFC normalisation; anything else raises #gsc_err{atom = bad_token}.
so_parse_char([$' | Quoted]) ->
    Unescaped = unescape($', Quoted, []),
    case unicode:characters_to_nfc_list(Unescaped) of
        [Codepoint] ->
            Codepoint;
        _NotSingleChar ->
            Reason = "Bad character literal: '" ++ Quoted,
            error(#gsc_err{atom   = bad_token,
                           string = Reason})
    end.
|
||||
|
||||
% Decode a string literal token into an NFC-normalised binary. The
% argument starts with the opening double quote.
so_parse_string([$" | Quoted]) ->
    Unescaped = unescape(Quoted),
    unicode:characters_to_nfc_binary(Unescaped).
|
||||
|
||||
% FIXME: simplify the unescape functions.
%
% so_scan operates on lists of bytes rather than on lists of characters,
% so single codepoints apparently have to be converted to multi-byte
% sequences. This module always works on lists, so this can probably be
% simplified. It has not been worth fixing so far, but this function has
% been the source of several annoying bugs -- change with care.
%
% unescape/1 unescapes the remainder of a double-quoted literal, i.e. the
% text after the opening quote up to and including the closing quote.
unescape(Quoted) ->
    unescape($", Quoted, []).
|
||||
|
||||
% Unescape the remainder of a literal delimited by Delim and return it as
% a binary. The input must end with the closing delimiter. Acc collects
% the decoded pieces in reverse order.
%
% Handled escapes: \x{H...} and \xHH (hexadecimal codepoints), an escaped
% delimiter, and \\ \b \e \f \n \r \t \v. Any other escape raises
% #gsc_err{atom = bad_escape_char}.
unescape(Delim, [Delim], Acc) ->
    % closing delimiter reached: emit what was collected
    unicode:characters_to_binary(lists:reverse(Acc));
unescape(Delim, [$\\, $x, ${ | Tail], Acc) ->
    % \x{...}: hex digits up to the closing brace
    {HexDigits, [_CloseBrace | Remaining]} =
        lists:splitwith(fun(Ch) -> Ch =/= $} end, Tail),
    Codepoint = list_to_integer(HexDigits, 16),
    Decoded = unicode:characters_to_nfc_list([Codepoint]),
    unescape(Delim, Remaining, [Decoded | Acc]);
unescape(Delim, [$\\, $x, Hi, Lo | Tail], Acc) ->
    % \xHH: exactly two hex digits
    Codepoint = list_to_integer([Hi, Lo], 16),
    Decoded = unicode:characters_to_nfc_list([Codepoint]),
    unescape(Delim, Tail, [Decoded | Acc]);
unescape(Delim, [$\\, Code | Tail], Acc) ->
    % single-character escapes
    Decoded =
        case Code of
            Delim -> Delim;
            $\\   -> $\\;
            $b    -> $\b;
            $e    -> $\e;
            $f    -> $\f;
            $n    -> $\n;
            $r    -> $\r;
            $t    -> $\t;
            $v    -> $\v;
            _     ->
                %% TODO (carried over from the original)
                error(#gsc_err{atom   = bad_escape_char,
                               string = "Bad control sequence: \\" ++ [Code]})
        end,
    unescape(Delim, Tail, [Decoded | Acc]);
unescape(Delim, [Ch | Tail], Acc) ->
    % ordinary character: copy through
    unescape(Delim, Tail, [Ch | Acc]).
|
||||
|
||||
|
||||
% Parse a hexadecimal integer literal: "0x" followed by hex digits, with
% underscores allowed as separators.
so_parse_hex("0x" ++ Digits) ->
    Cleaned = strip_underscores(Digits),
    list_to_integer(Cleaned, 16).
|
||||
|
||||
% Parse a decimal integer literal; underscores are allowed as separators.
so_parse_int(S) ->
    Cleaned = strip_underscores(S),
    list_to_integer(Cleaned).
|
||||
|
||||
% Parse a bytes literal: "#" followed by hex digits, with underscores
% allowed as separators. The result is a binary holding the value in
% ceil(NumberOfHexDigits / 2) bytes, so an odd digit count is padded with
% a leading zero nibble.
so_parse_bytes("#" ++ Raw) ->
    HexDigits = strip_underscores(Raw),
    Value = list_to_integer(HexDigits, 16),
    ByteCount = (length(HexDigits) + 1) div 2,
    <<Value:ByteCount/unit:8>>.
|
||||
|
||||
% Remove every underscore from a string.
strip_underscores(S) ->
    [Ch || Ch <- S, Ch /= $_].
|
||||
Reference in New Issue
Block a user