% @doc compatibility layer to test against so_scan
%
% converts gsc_tokens data to so_scan tokens
%
% Ref: so_scan.erl
-module(gso_scan).

-export_type([
    so_kwd/0,
    so_special_char/0,
    so_symbol/0,
    so_token2/0,
    so_token3/0,
    so_token/0
]).

-export([
    scan/1,
    ken_barson_rises/2
]).

% NOTE(review): tk(), tk_pos() and gsc_err() are not defined in this file;
% presumably they (and the #tk{} / #gsc_err{} records) come from this
% header -- confirm.
-include("$gsc_include/gsc.hrl").

%================================
% API: types
%================================

% FIXME: single-quote all the atoms to future-proof against new reserved
% words being added to erlang
%
% Keyword atoms that may appear as the Symbol of a so_token2().
-type so_kwd()
    :: contract | include | 'let' | switch | type | record | datatype
     | 'if' | elif | 'else' | function | stateful | payable
     | 'true' | 'false' | mod | public | entrypoint | private | indexed
     | namespace | interface | main | using | as | for | hiding
     | 'band' | 'bor' | 'bxor' | 'bnot'.

% Punctuation atoms that may appear as the Symbol of a so_token2().
-type so_special_char()
    :: '..' | ',' | '.' | ';' | '(' | ')' | '[' | ']' | '{' | '}' .

% @doc bad type... essentially a string that is the outcome of a regex match is
% cast to an atom, and that's the type that goes here
%
% Because of the atom() alternative this is effectively just atom(); the
% so_kwd() and so_special_char() alternatives are documentation only.
-type so_symbol() :: so_kwd() | so_special_char() | atom().

% Two-element token: the symbol itself plus its source location.
-type so_token2() :: {Symbol :: so_symbol(), Location :: tk_pos()}.

% FIXME
% this is 'id', 'con', qid
%
% Tags of the three-element tokens (the ones that carry a value).
-type so_tk3type() :: char | string | hex | int | bytes | qid | qcon | tvar | id | con.

% Three-element token: tag, source location, and the token's value.
-type so_token3() :: {TokenType :: so_tk3type(), Location :: tk_pos(), TokenValue :: term()}.

% Any token in the so_scan output format.
-type so_token() :: so_token2() | so_token3().

%================================
% API: functions
%================================

-spec scan(SrcStr) -> {ok, SoTokens} | {error, gsc_err()}
    when SrcStr   :: iolist(),
         SoTokens :: [so_token()].
% @doc
% Tokenize SrcStr with gsc_tokens and re-express the result in the token
% format that so_scan produces.
%
% The output is meant to agree with so_scan:scan/1 in all cases.
% @end
scan(SrcStr) ->
    scan_outcome(gsc_tokens:tokens(SrcStr)).

% Translates the result of gsc_tokens:tokens/1 into scan/1's return value.
scan_outcome({ok, GscTokens}) ->
    {ok, to_so_tokens(GscTokens)};
% An unterminated block comment is reported as success, using the tokens
% carried in the error's prev_tokens field.
% NOTE(review): presumably this mirrors so_scan's behaviour -- confirm.
scan_outcome({error, #gsc_err_bcom_unterminated{prev_tokens = GscTokens}}) ->
    {ok, to_so_tokens(GscTokens)};
% Anything else is handed back to the caller untouched.
scan_outcome(Other) ->
    Other.

-spec to_so_tokens(GscTokens) -> SoTokens
    when GscTokens :: [tk()],
         SoTokens  :: [so_token()].
% @doc
% Convert a list of gsc tokens into a list of so_scan tokens.
%
% Most gsc tokens map 1-to-1 onto so_scan tokens (see to_so_token/1).
% The exception is ak/ct/sg literals: that mapping is many-to-one, so
% it has to be handled here, at the list level.
%
% The reason is as follows:
%
% so_scan lexes ak_ABCD to an id, and only at the parsing stage
% computes the pubkey it corresponds to.
%
% As a result, given ak_GHI (I is not a valid base58 char), gsc lexes
%
%   [{ak, "ak_GH"}, {con, "I"}]
%
% whereas so_scan lexes {id, "ak_GHI"}.
%
% gsc does not discard whitespace, so this situation is detectable: it
% occurs precisely when an ak/sg/ct token is immediately followed by a
% non-whitespace token. There may be more than one such follower and
% they may have a variety of shapes, so they are greedily consumed back
% into a single id by ken_barson_rises/2.
% @end
to_so_tokens(GscTokens) ->
    to_so_tokens(GscTokens, []).

% Accumulator loop behind to_so_tokens/1; Acc holds the so_scan tokens
% produced so far, most recent first.
to_so_tokens([], Acc) ->
    lists:reverse(Acc);
to_so_tokens([ApiTok = #tk{shape = Shape, pos = Pos} | Following], Acc)
  when Shape =:= ak orelse Shape =:= ct orelse Shape =:= sg ->
    % rejoin the literal with whatever pieces were split off it; the
    % resulting id keeps the position of the original ak/ct/sg token
    {#tk{str = JoinedStr}, Remaining} = ken_barson_rises(ApiTok, Following),
    to_so_tokens(Remaining, [{id, Pos, JoinedStr} | Acc]);
to_so_tokens([GscTok | Following], Acc) ->
    % the remainder is a plain filtermap over to_so_token/1
    case to_so_token(GscTok) of
        {true, SoToken} -> to_so_tokens(Following, [SoToken | Acc]);
        false           -> to_so_tokens(Following, Acc)
    end.
-spec ken_barson_rises(InitApiToken, SfToks) -> {FinalApiToken, NewSfToks}
    when InitApiToken  :: tk(),
         SfToks        :: [tk()],
         FinalApiToken :: InitApiToken,
         NewSfToks     :: SfToks.
% @doc
% Rejoin an ak/ct/sg token with the tokens that gsc split off its tail.
%
% Takes the split pieces (e.g. `ak_GHI` lexed to `ak_GH` followed by
% `I`) and recursively glues them back together, so that they form the
% single token so_scan would have produced (which will then fail in the
% parsing step).
%
% This relies on the property that the concatenation of all the token
% strings equals the original source file
% (FIXME: should test this in test suite)
%
% The function looks at the next token; if its shape is one that
% so_scan would consume as part of an `id` token (smash_types/0), its
% string is appended to the accumulated token and the scan continues.
% If its shape is in pass_types/0, the accumulated token and the
% untouched remaining tokens are returned. A shape in neither list
% fails the sanity check with a badmatch.
%
% Background:
%   so_scan lexes identifiers with /[a-z_][a-zA-Z0-9_']*/. The base58
%   alphabet used by `smr_apistr58` is:
%
%   123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz
%
%   Characters **valid in a so_scan id tail** but **absent from
%   base58**:
%
%   | char | why excluded from base58              |
%   |------|---------------------------------------|
%   | `0`  | looks like `O`                        |
%   | `I`  | looks like `1` or `l`                 |
%   | `O`  | looks like `0`                        |
%   | `l`  | looks like `1` or `I`                 |
%   | `_`  | not alphanumeric (structural, not b58)|
%   | `'`  | not alphanumeric (sophia id quirk)    |
%
%   When any of these appear AFTER at least one valid base58 char in a
%   `ak_`/`ct_`/`sg_` prefixed identifier, `gsc` splits what `so_scan`
%   sees as one `id` token into 2+ gsc tokens.
%
%   **No split if non-base58 char is immediately after `_`**:
%   `smr_plus` requires >=1 base58 char to match; `ak_I`, `ak_0`,
%   `ak__bar` all fall through to `id` and both tokenizers agree.
% @end
ken_barson_rises(Joined, []) ->
    % ran out of input: nothing further to absorb
    {Joined, []};
ken_barson_rises(Joined = #tk{str = JoinedStr},
                 Pending = [#tk{shape = Shape, str = Piece} | Tail]) ->
    Absorb = lists:member(Shape, smash_types()),
    Stop   = lists:member(Shape, pass_types()),
    % sanity check: every shape must be classified one way or the other
    true = Absorb or Stop,
    case Absorb of
        true ->
            % glue the follower's text onto the accumulated token and keep going
            ken_barson_rises(Joined#tk{str = JoinedStr ++ Piece}, Tail);
        false ->
            % the follower ends the id; hand it back unconsumed
            {Joined, Pending}
    end.

% Token shapes that get absorbed into a preceding ak/ct/sg token.
% Example splits (source text -> the follower token gsc produces):
%
%   char    ak_GH'a'    -> {char,  "'a'"}
%   int16   ak_GH0xAB   -> {int16, "0xAB"}
%   int10   ak_GH0123   -> {int10, "0123"}
%   tvar    ak_GH'a     -> {tvar,  "'a"}
%   kwd     ak_GHlet    -> {kwd,   "let"}
%   id      ak_GH_AB    -> {id,    "_AB"}
smash_types() ->
    [char,
     int16,
     int10,
     tvar,
     kwd,
     id,
     con].
% smash_types/0, `con` entry: ak_GHI -> {con, "I"}

% Token shapes that are never absorbed into a preceding ak/ct/sg token.
pass_types() ->
    % why each of these are impossible
    % meaning the prefix for each of these will cause
    % so_scan to break out of consuming an id, or will
    % never be a disjoined neighbor
    [lcom,      % ak_AB// breaks out of id
     bcom,      % ak_AB/* breaks out of id
     ws,        % ak_AB\t breaks out of id
     punct,     % ak_AB{ breaks out of id
     string,    % ak_AB" breaks out of id
     bytes,     % ak_AB# breaks out of id
     ak,ct,sg,  % ak_ABak [akctsg] all in base58 alphabet
     qid,       % ak_ABI.Am.A.qid ??? maybe sophia lexes this to [{id, _}, '.']?
     qcon,      % ak_ABI.Am.A.QCon ??? same
     op].       % ak_AB=< [=!<>+-*/:&|?~@^] break out of id

-spec to_so_token(GscToken) -> MaybeSoToken
    when GscToken     :: tk(),
         MaybeSoToken :: {true, SoToken} | false,
         SoToken      :: so_token().
% @private
% Convert a single gsc token to its so_scan equivalent, filtermap-style:
% returns `false' for tokens so_scan drops (comments, whitespace) and
% `{true, SoToken}' otherwise.
%
% does NOT handle ak/ct/sg because these may consume
% follow-on tokens; those are dealt with at the list level by
% to_so_tokens/1. so_scan lexes ak/ct/sg as ids and only interprets
% them as addresses/sigs in the parsing step.
%
% Raises error(#gsc_err{atom = nyi}) for any shape not listed below
% (which includes ak/ct/sg).
% @end
to_so_token(#tk{shape = SfTokenType, pos = Pos, str = SfTokenStr}) ->
    case SfTokenType of
        %-----------------
        % Ignored
        %-----------------
        bcom -> false;
        lcom -> false;
        ws   -> false;
        %-----------------------
        % {_, _}
        %
        % {contract, {420, 69}}
        %-----------------------
        % kwds ops and punct are all collapsed by
        % so_scan:scan down to eg {'contract', {420, 69}}
        % where {420, 69} is the source location
        % these are three different parsers
        %
        % NOTE(review): list_to_atom/1 turns source text into atoms;
        % presumably the kwd/op/punct vocabularies are finite so the
        % atom table cannot be exhausted -- confirm against gsc_tokens.
        Sym when Sym =:= kwd; Sym =:= op; Sym =:= punct ->
            Symbol = list_to_atom(SfTokenStr),
            {true, {Symbol, Pos}};
        %------------------------------------
        % {_, _, _}
        %
        % {id, {420, 69}, "foo"}
        %--------------------------------
        QVar when QVar =:= qid; QVar =:= qcon ->
            % qualifieds tokenize to
            % {qid, {420, 69}, ["Foo", "Bar", "baz"]}
            {true, {QVar, Pos, string:tokens(SfTokenStr, ".")}};
        SfVar when SfVar =:= id; SfVar =:= con; SfVar =:= tvar ->
            {true, {SfVar, Pos, SfTokenStr}};
        % literals
        % from so_scan:
        % {CHAR,   token(char,   fun parse_char/1)}
        % {STRING, token(string, fun parse_string/1)}
        % {HEX,    token(hex,    fun parse_hex/1)}
        % {INT,    token(int,    fun parse_int/1)}
        % {BYTES,  token(bytes,  fun parse_bytes/1)}
        % so_scan casts strings to binary
        char   -> {true, {char,   Pos, so_parse_char(SfTokenStr)}};
        string -> {true, {string, Pos, so_parse_string(SfTokenStr)}};
        int16  -> {true, {hex,    Pos, so_parse_hex(SfTokenStr)}};
        int10  -> {true, {int,    Pos, so_parse_int(SfTokenStr)}};
        bytes  -> {true, {bytes,  Pos, so_parse_bytes(SfTokenStr)}};
        NYI ->
            % ?MODULE keeps the reported location correct (the message
            % used to name a module that does not exist)
            Msg = io_lib:format("~s:to_so_token/1: unhandled token shape: ~p",
                                [?MODULE, NYI]),
            error(#gsc_err{atom = nyi, str = Msg})
    end.

% copied from so_scan.erl
%
% Parse a character literal (including its leading quote) into a single
% codepoint. Raises error(#gsc_err{atom = bad_token}) when the literal
% does not normalize (NFC) to exactly one codepoint.
so_parse_char([$' | Chars]) ->
    case unicode:characters_to_nfc_list(unescape($', Chars, [])) of
        [Char] -> Char;
        _Bad   -> error(#gsc_err{atom = bad_token,
                                 str  = "Bad character literal: '" ++ Chars})
    end.

% Parse a string literal (including its leading double quote) into an
% NFC-normalized binary.
so_parse_string([$" | Chars]) ->
    unicode:characters_to_nfc_binary(unescape(Chars)).

% FIXME: simplify this
%
% this all works in a roundabout way because so_scan operates on
% lists of bytes, rather than on character-lists. So single
% codepoints have to be converted to multi-byte sequences or
% similar. We're always working on lists, so this can probably be
% simplified. It has not been worth fixing so far, but this function
% has been the source of several annoying bugs
%
% Unescape the body of a double-quoted string literal; Str is the text
% after the opening quote, up to and including the closing quote.
unescape(Str) -> unescape($", Str, []).
% Unescape the body of a quoted literal.
%
%   Delim - the closing delimiter character ($" or $')
%   Chars - remaining characters, ending with the closing delimiter
%   Acc   - output accumulated so far, most recent first
%
% Returns the unescaped text as a binary. Raises
% error(#gsc_err{atom = bad_escape_char}) on an unknown backslash escape.
% There is deliberately no clause for running out of input before the
% closing delimiter; NOTE(review): presumably the lexer only produces
% properly terminated literals -- confirm.
unescape(Delim, [Delim], Acc) ->
    % the closing delimiter must be the very last character
    unicode:characters_to_binary(lists:reverse(Acc));
unescape(Delim, [$\\, $x, ${ | Chars ], Acc) ->
    % \x{H...}: hex codepoint of arbitrary length, terminated by '}'
    {Ds, [_ | Cs]} = lists:splitwith(fun($}) -> false ; (_) -> true end, Chars),
    C = list_to_integer(Ds, 16),
    Utf8Cs = unicode:characters_to_nfc_list([C]),
    unescape(Delim, Cs, [Utf8Cs | Acc]);
unescape(Delim, [$\\, $x, D1, D2 | Chars ], Acc) ->
    % \xHH: exactly two hex digits
    C = list_to_integer([D1, D2], 16),
    Utf8Cs = unicode:characters_to_nfc_list([C]),
    unescape(Delim, Chars, [Utf8Cs | Acc]);
unescape(Delim, [$\\, Code | Chars], Acc) ->
    % single-character escapes
    Ok = fun(C) -> unescape(Delim, Chars, [C | Acc]) end,
    case Code of
        Delim -> Ok(Delim);
        $\\   -> Ok($\\);
        $b    -> Ok($\b);
        $e    -> Ok($\e);
        $f    -> Ok($\f);
        $n    -> Ok($\n);
        $r    -> Ok($\r);
        $t    -> Ok($\t);
        $v    -> Ok($\v);
        _     -> error(#gsc_err{atom = bad_escape_char,
                                str  = "Bad control sequence: \\" ++ [Code]}) %% TODO
    end;
unescape(Delim, [C | Chars], Acc) ->
    % ordinary character: copy through
    unescape(Delim, Chars, [C | Acc]).

% Parse a hex integer literal ("0x" prefix, underscores allowed as
% digit separators) into an integer.
so_parse_hex("0x" ++ S) -> list_to_integer(strip_underscores(S), 16).

% Parse a decimal integer literal (underscores allowed as digit
% separators) into an integer.
so_parse_int(S) -> list_to_integer(strip_underscores(S)).

% Parse a bytes literal ("#" followed by hex digits, underscores allowed
% as separators) into a binary. The binary is ceil(NumHexDigits / 2)
% bytes wide, so an odd number of digits is left-padded with a zero
% nibble.
so_parse_bytes("#" ++ S0) ->
    S = strip_underscores(S0),
    N = list_to_integer(S, 16),
    Digits = (length(S) + 1) div 2,
    % N encoded big-endian in Digits bytes (size Digits, unit 8 bits)
    <<N:Digits/unit:8>>.

% Remove every underscore from the string S.
strip_underscores(S) ->
    [C || C <- S, C =/= $_].