diff --git a/DONT_README.md b/DONT_README.md index 4e374a9..4dec60f 100644 --- a/DONT_README.md +++ b/DONT_README.md @@ -1,10 +1,15 @@ -# TODONE - # TODO -- barf for outputs, slurp for inputs -- architecture needs more careful thought but only after something works +- architecture needs more careful thought but only after something + works - too fuzzy right now -- possibly: - - rename parser layers sequentially: - - gsc_ +- undo gs_ naming fuckery.. everything is `gsc_*`. it's just + needlessly confusing. for now let's name new things gsc_* and then + go back and undo the stupidity + +# TONOTDO + +- barf for outputs, slurp for inputs +- rename parser layers sequentially + +# TODONE diff --git a/cli/src/gsc_cli.erl b/cli/src/gsc_cli.erl index 69f8b3d..f0d06b8 100644 --- a/cli/src/gsc_cli.erl +++ b/cli/src/gsc_cli.erl @@ -47,6 +47,8 @@ do(["list", "tests"]) -> do_tlist(); do(["test"]) -> do_tests(); +do(["test" | Tests]) -> + do_tests(Tests); do(["tests"]) -> do_tests(); do(["run", "tests"]) -> @@ -106,14 +108,48 @@ less_file(Less, FilePath) -> end. do_tests() -> - io:format("TestModules = ~p~n", [known_modules_with_prefix("ts")]), + io:format("TestModules = ~p~n", [test_mods()]), do_runall_tests(). do_runall_tests() -> lists:foreach(fun run_mod_main/1, test_mods()). + +do_tests(List) -> + lists:foreach(fun run_test/1, List). + +% n +run_test(TestName) -> + % we have two candidate atoms + C1 = list_to_atom(TestName), + C2 = list_to_atom("gsc_test_" ++ TestName), + KnownMods = test_mods(), + IsC1 = lists:member(C1, KnownMods), + IsC2 = lists:member(C2, KnownMods), + if + IsC1 -> rmm(C1); + IsC2 -> rmm(C2); + true -> error({no_such_test, TestName}) + end. + + +rmm(X) -> run_mod_main(X). + +% KnownTests = test_mods(), +% TestMods = ensure_all_known([], List, KnownTests), +% lists:foreach(fun run_mod_main/1, TestMods). + + +%ensure_all_known(Acc, [], _) -> +% lists:sort(Acc); +%ensure_all_known(Acc, [T | Ts], Knowns) -> +% case lists:member(T, Knowns) of +% +% end. + + test_mods() -> - known_modules_with_prefix("gs_test"). + known_modules_with_prefix("gsc_test"). known_modules_with_prefix(Pfx) -> ModsZipBeamsZipLoaded = code:all_available(), diff --git a/cli/src/gsc_test_ntree.erl b/cli/src/gsc_test_ntree.erl new file mode 100644 index 0000000..d7fe151 --- /dev/null +++ b/cli/src/gsc_test_ntree.erl @@ -0,0 +1,109 @@ +-module(gsc_test_ntree). + +-export([ + main/0 +]). + +-include("$gsc_include/gsc.hrl"). + + +main() -> + x00(), + ok. + +% x00 = example00 +x00() -> + io:format("Example 00:~n", []), + io:format(" SrcStr = ~p~n", [x00_src()]), + io:format(" Tokens = ~p~n", [x00_tks()]), + io:format(" Signal = ~p~n", [x00_sgl()]), + io:format(" Tree0 = ~p~n", [x00_tree0()]), + ok. + +% sample type expr, tokens, signal +x00_src() -> "foo => bar * baz". +x00_tks() -> gsc:unsafe_tokens_from_string(x00_src()). +x00_sgl() -> gsc:filter_signal(x00_tks()). +x00_tree0() -> mktree(x00_sgl()). + +% records copypasta for now +-record(ns, {val :: any(), kids :: list(any())}). +-record(nl, {val :: any()}). + +-type ntree(X, Y) :: gsc_ntree:ntree(X, Y). +-type ntree() :: gsc_ntree:ntree(). + +-type ast_stem_t() :: vtokens + | {op, tk()} + . + +-type ast() :: ntree(ast_stem_t(), tk()). + + +-spec mktree(Signal) -> Tree when + Signal :: gsc:signal(), + Tree :: gsc_ntree:ntree(). + +% @doc make into a tree +mktree(Sig) -> + Tree0 = gsc_ntree:nstem(vtokens, Sig), + Tree1 = rerootl_tkstr("=>", Tree0), + Tree2 = rerootl_tkstr("*", Tree1), + Tree2. + + +rerootl_tkstr(S, Tree0 = #ns{val = Root0}) -> + Kids0 = gsc_ntree:deleaf0(Tree0), + IsntS = fun(Tk) -> isnt_str(S, Tk) end, + case lists:splitwith(IsntS, Kids0) of + % found + % input: + % *s Root0 + % | + % +-- .l Foo + % +-- .l "=>" + % +-- .l Bar + % output: + % *s "=>" + % | + % +-- *s Root0 -- .l Foo + % +-- *s Root0 -- .l Bar + {LHS1, [Tk0 | RHS1]} -> + Root1 = Root0, + LTree1 = gsc_ntree:releaf0(Root1, LHS1), + RTree1 = rerootl_tkstr(S, gsc_ntree:releaf0(Root1, RHS1)), + NewRoot0 = {op, Tk0}, + NewKids0 = [LTree1, RTree1], + NewTree = gsc_ntree:releaf0(NewRoot0, NewKids0), + NewTree; + % not found, nothing to do + {Kids0, []} -> + Tree0 + end. + + +%reroot_mapsto(Tree0 = #ns{val = Root0}) -> +% Kids0 = gsc_ntree:deleaf0(Tree0), +% IsntMapsto = fun(DL) -> isnt_str("=>", Tk) end, +% case lists:splitwith(IsntMapsto, Kids0) of +% % found +% {LHS1, [Tk0 | RHS1]} -> +% Root1 = Root0, +% LTree1 = gsc_ntree:releaf0(Root1, LHS1), +% RTree1 = reroot_mapsto(gsc_ntree:releaf0(Root1, RHS1)), +% NewRoot0 = {op, Tk0}, +% NewKids0 = [LTree1, RTree1], +% NewTree = gsc_ntree:releaf0(NewRoot0, NewKids0), +% NewTree; +% % nothing to do +% {Kids0, []} -> +% Tree0 +% end. + + + +isnt_str(X, Y) -> + not is_str(X, Y). + +is_str(S, #tk{str = S}) -> true; +is_str(_, _) -> false. diff --git a/cli/src/gs_test_tokens.erl b/cli/src/gsc_test_tokens.erl similarity index 99% rename from cli/src/gs_test_tokens.erl rename to cli/src/gsc_test_tokens.erl index eabdeaa..4cd1482 100644 --- a/cli/src/gs_test_tokens.erl +++ b/cli/src/gsc_test_tokens.erl @@ -1,5 +1,5 @@ % gsc tokenizer tests --module(gs_test_tokens). +-module(gsc_test_tokens). -export([ main/0, ct_dir/0 diff --git a/cli/src/ts_utils.erl b/cli/src/ts_utils.erl new file mode 100644 index 0000000..7813171 --- /dev/null +++ b/cli/src/ts_utils.erl @@ -0,0 +1,25 @@ +% testing utilities +-module(ts_utils). + +-export([ + ct_dir/0, + ct_file/1 +]). + + +-spec ct_dir() -> string(). + +% directory containing the tests for the tokenizer +ct_dir() -> + zx_daemon:get_home() ++ "/ct". + + + +-spec ct_file(Name) -> AbsPath when + Name :: string(), + AbsPath :: string(). +% @doc +% ct_file("foo.aes") -> "/path/to/ct/foo.aes" + +ct_file(Name) -> + ct_dir() ++ "/" ++ Name. diff --git a/scratch/gsc_ast.erl b/scratch/gsc_ast.erl index 3fd6b45..be6b848 100644 --- a/scratch/gsc_ast.erl +++ b/scratch/gsc_ast.erl @@ -196,13 +196,13 @@ %gulp_file([]) -> % {error, empty_file}; %gulp_file(Tokens) -> -% case gs_tokens:take_block(Tokens) of +% case gsc_tokens:take_block(Tokens) of % {Tokens, []} -> % gulp_block(fun gulp_top_decl/1, Tokens); % %gulp_file2([], [], Tokens); % {A, B} -> -% StartPos = gs_tokens:start_pos(A), -% ErrPos = gs_tokens:start_pos(B), +% StartPos = gsc_tokens:start_pos(A), +% ErrPos = gsc_tokens:start_pos(B), % Msg = efmt("gulp_file: block starting at ~p ends at ~p instead of EOF", % [StartPos, ErrPos]), % {error, #parse_error{pos = ErrPos, msg = Msg}} @@ -212,7 +212,7 @@ % %%gulp_file2(AccOks, AccErrs, Tokens = [_ | _]) -> %% % ItemTokens will be nonempty -%% {ItemTokens, NewTokens} = gs_tokens:take_block_item(Tokens), +%% {ItemTokens, NewTokens} = gsc_tokens:take_block_item(Tokens), %% case gulp_top_decl(ItemTokens) of %% {gulp, Ok} -> gulp_file2([Ok | AccOks], AccErrs, NewTokens); %% Err -> gulp_file2(AccOks, [Err | AccErrs], NewTokens) @@ -258,7 +258,7 @@ % %gulp_block(GulpItem, AccOks, AccErrs, Tokens = [_ | _]) -> % % ItemTokens will be nonempty -% {ItemTokens, NewTokens} = gs_tokens:take_block_item(Tokens), +% {ItemTokens, NewTokens} = gsc_tokens:take_block_item(Tokens), % case GulpItem(ItemTokens) of % {gulp, Ok} -> gulp_block(GulpItem, [Ok | AccOks], AccErrs, NewTokens); % Err -> gulp_block(GulpItem, AccOks, [Err | AccErrs], NewTokens) @@ -284,7 +284,7 @@ %% | Using %% @end %gulp_top_decl(DeclTokens) -> -% case gs_tokens:strings(3, DeclTokens) of +% case gsc_tokens:strings(3, DeclTokens) of % ["payable", "contract", "interface"] -> % gulp_nyi(DeclTokens); % ["contract", "interface" | _] -> @@ -410,7 +410,7 @@ %% | (EModifier* 'entrypoint' | FModifier* 'function') Block(FunDecl) %% | Using %gulp_decl(Tokens) -> -% case gs_tokens:strings(1, Tokens) of +% case gsc_tokens:strings(1, Tokens) of % ["type"] -> gulp_type_alias(Tokens); % _ -> gulp_nyi(Tokens) % end. @@ -611,7 +611,7 @@ %% Type1 = {plist, Types} () (foo) (foo, bar) %% | {token, #tk{}} foo Bar.baz 'quux %slurp_type1(Tks) -> -% case gs_tokens:slurp_plist(Tks) of +% case gsc_tokens:slurp_plist(Tks) of % % head token is NOT open paren -> must be id/qid/tvar % {slurp, [], [Tk | NewTks]} -> % TkType = Tk#tk.type, @@ -633,7 +633,7 @@ % % %%slurp_type_expr_plist(Tks) -> -%% case gs_tokens:slurp_plist(Tks) of +%% case gsc_tokens:slurp_plist(Tks) of %% % head token is NOT open paren -> must be id/qid/tvar %% {slurp, [], [Tk | NewTks]} -> %% TkType = Tk#tk.type, diff --git a/scratch/gsc_parse_type_expr.erl b/scratch/gsc_parse_type_expr.erl index 52d39a6..c3f162e 100644 --- a/scratch/gsc_parse_type_expr.erl +++ b/scratch/gsc_parse_type_expr.erl @@ -43,7 +43,7 @@ % %% @doc for testing %unsafe_vtks_from_string(S) -> -% {ok, SigTks} = gs_tokens:significant_tokens(S), +% {ok, SigTks} = gsc_tokens:significant_tokens(S), % {gulp, Vtks} = gulp_vtks(SigTks), % Vtks. % @@ -110,7 +110,7 @@ % end. % %slurp_plist_rec(Tokens = [#tk{string = "(" | _]) -> -% case gs_tokens:slurp_plist(Tokens) of +% case gsc_tokens:slurp_plist(Tokens) of % {slurp, [], _} -> % barf; % {slurp, PTokens, NewTokens} -> @@ -156,7 +156,7 @@ % {_Pfx = Tks1_BeforeOpen, % _Sfx = Tks2_OpenNAfter % = [#tk{string = "("} | _]} -> -% case gs_tokens:slurp_plist(Tks2_OpenNAfter) of +% case gsc_tokens:slurp_plist(Tks2_OpenNAfter) of % {slurp, Tks2A_OpenToClose, Tks2B_AfterClose} -> % NewAcc = [Acc, % Tks1_BeforeOpen, diff --git a/scratch/gsc_token_chunks.erl b/scratch/gsc_token_chunks.erl index 2634676..8156339 100644 --- a/scratch/gsc_token_chunks.erl +++ b/scratch/gsc_token_chunks.erl @@ -63,7 +63,7 @@ %-spec end_pos([gsc_token()]) -> {value, tk_pos()} | none. % %end_pos([#gsc_token{pos = Pos, string = Str}]) -> -% {value, gs_tokens:new_pos(Pos, Str)}; +% {value, gsc_tokens:new_pos(Pos, Str)}; %end_pos([_ | T]) -> % end_pos(T); %end_pos([]) -> diff --git a/src/gsc.erl b/src/gsc.erl index 9bcb283..2fa5aad 100644 --- a/src/gsc.erl +++ b/src/gsc.erl @@ -4,12 +4,12 @@ % based on original sophia compiler % % parse layers: -% 1. gs_tokens: SrcStr -> (Tokens | SigTokens) +% 1. gsc_tokens: SrcStr -> (Tokens | SigTokens) % % SigTokens = not comment/whitespace % % layers: -% a. gs_strmatch : matches string shapes +% a. gsc_strmatch : matches string shapes % b. gso_scan : converts to so_scan shapes % % @@ -32,14 +32,29 @@ -module(gsc). -export_type([ - token/0 + token/0, + signal/0 ]). -export([ + unsafe_tokens_from_file/1, + unsafe_tokens_from_string/1, + unsafe_signal_from_file/1, + unsafe_signal_from_string/1, + filter_signal/1, + signal_from_string/1, + signal_from_file/1, sigtokens_from_file/1, sigtokens_from_string/1, tokens_from_file/1, - tokens_from_string/1 + tokens_from_string/1, + % sophia compatibility + gso_tokens_from_file/1, + gso_tokens_from_string/1, + % unicode normalization + very_stable_codepoints/1, + very_stable_string/1, + very_stable_file/1 ]). -include("$gsc_include/gsc.hrl"). @@ -50,19 +65,52 @@ -type token() :: tk(). +% @doc signal means non-noise (whitespace/comment) +% tokens; legacy name still around is "sigtokens" +-type signal() :: [tk()]. + %----------------------------------------- -% functions +% API: FUNCTIONS %----------------------------------------- +%----------------------------------------- +% aint nobody got time for case shit +%----------------------------------------- +% tokens +unsafe_tokens_from_file(F) -> + {ok, Tks} = tokens_from_file(F), + Tks. + +unsafe_tokens_from_string(S) -> + {ok, Tks} = tokens_from_string(S), + Tks. + + +% signal +unsafe_signal_from_file(F) -> + {ok, Tks} = signal_from_file(F), + Tks. + +unsafe_signal_from_string(S) -> + {ok, Tks} = signal_from_string(S), + Tks. + + +% +filter_signal(X) -> gsc_tokens:filter_significant(X). +signal_from_file(X) -> sigtokens_from_file(X). +signal_from_string(X) -> sigtokens_from_string(X). + +% @doc legacy name for signal sigtokens_from_file(X) -> case tokens_from_file(X) of - {ok, Y} -> {ok, gs_tokens:filter_significant(Y)}; + {ok, Y} -> {ok, gsc_tokens:filter_significant(Y)}; Err -> Err end. sigtokens_from_string(X) -> case tokens_from_string(X) of - {ok, Y} -> {ok, gs_tokens:filter_significant(Y)}; + {ok, Y} -> {ok, gsc_tokens:filter_significant(Y)}; Err -> Err end. @@ -81,7 +129,6 @@ tokens_from_file(FilePath) -> - -spec tokens_from_string(SrcStr) -> Result when SrcStr :: string(), Result :: {ok, Tokens} @@ -89,4 +136,72 @@ tokens_from_file(FilePath) -> Tokens :: [tk()]. tokens_from_string(SrcStr) -> - gs_tokens:tokens(SrcStr). + gsc_tokens:tokens(SrcStr). + + + +-spec gso_tokens_from_file(FilePath) -> Result when + FilePath :: string(), + Result :: {ok, GsoTks} | {error, Reason}, + GsoTks :: [gso_scan:so_token()], + Reason :: gsc_err() | any(). + +gso_tokens_from_file(FilePath) -> + case file:read_file(FilePath) of + {ok, Bytes} -> gso_tokens_from_string(Bytes); + Error -> Error + end. + + + +-spec gso_tokens_from_string(Str) -> Result when + Str :: iolist(), + Result :: {ok, GsoTks} | {error, Reason}, + GsoTks :: [gso_scan:so_token()], + Reason :: gsc_err() | any(). + +gso_tokens_from_string(Evil) -> + Str = gsc_tokens:very_stable_codepoints(Evil), + gso_scan:scan(Str). + + + + +-spec very_stable_codepoints(String) -> Normalized when + String :: iolist(), + Normalized :: string(). + +%% @doc normalize string to utf8 NFC list form +very_stable_codepoints(X) -> + gsc_tokens:very_stable_codepoints(X). + + + +-spec very_stable_string(String) -> Normalized when + String :: iolist(), + Normalized :: string(). + +%% @doc alias for `very_stable_codepoints/1' +very_stable_string(X) -> + gsc_tokens:very_stable_codepoints(X). + + + +-spec very_stable_file(FilePath) -> Contents when + FilePath :: string(), + Contents :: string(). + +%% @doc Read file, return contents as +%% `unicode:characters_to_nfc_list/1' list. +%% +%% Please note that this function is NOT in fact very +%% stable, as it throws an error if there's some error +%% reading the file (e.g. not found). +%% +%% this function exists mostly for scripting/shell +%% convenience +very_stable_file(X) -> + case file:read_file(X) of + {ok, B} -> very_stable_codepoints(B); + Error -> error(Error) + end. diff --git a/src/gsc_ntree.erl b/src/gsc_ntree.erl new file mode 100644 index 0000000..264f941 --- /dev/null +++ b/src/gsc_ntree.erl @@ -0,0 +1,127 @@ +-module(gsc_ntree). + +-export_type([ + ntree/2, + ntree/0 +]). + +-export([ + nstem/2, + flatten/1, + deleaf0/1, + releaf0/2 +]). + + +-include("$gsc_include/gsc.hrl"). + +%%===================================================== +%% API: types +%%===================================================== + +-record(ns, {val :: any(), kids :: list(any())}). +-record(nl, {val :: any()}). + +%% @doc ntree(S, L) is a "node tree" (meaning stems +%% have values and children) +-type ntree(S, L) + :: #ns{val :: S, kids :: [ntree(S, L)]} + | #nl{val :: L}. + +-type ntree() :: ntree(any(), any()). + + +%%===================================================== +%% API: functions +%%===================================================== + + +-spec nstem(Root, List) -> Tree when + Root :: X, + List :: list(Y), + Tree :: ntree(X, Y), + X :: any(), + Y :: any(). +% @doc +% You *probably* want `releaf0/2' instead. +% +% This function naively wraps each element in the list +% in a leaf type, even if it's already wrapped. +% +% nstem(root, [Foo, Bar, Baz]) ~> +% *s root +% | +% +--- .l Foo +% | +% +--- .l Bar +% | +% +--- .l Baz +% +% Much more common use case is to releaf only the input +% nodes which are not already wrapped, which is what +% `releaf0/2' does. +% @end +nstem(Root, List) -> + {ns, Root, [{nl, Y} || Y <- List]}. + + + +-spec flatten(Tree) -> LeafVals when + Tree :: ntree(any(), LeafType), + LeafVals :: [LeafType], + LeafType :: any(). + +flatten({nl, X}) -> + [X]; +flatten({ns, _, Keeids}) -> + lists:flatten([flatten(Keeid) || Keeid <- Keeids]). + + + +-spec deleaf0(Tree) -> Result when + Tree :: ntree(S, L), + Result :: [L | Tree], + S :: any(), + L :: any(). + +% @doc unwrap the leaf children, and leave the stem +% children intact +% +% ex. 1: +% (+ 1 2 (* 3 4) 5) +% ~> '(1 2 (* 3 4) 5) +% +% ex. 2: +% {ns, '+', [{nl, 1}, +% {nl, 2}, +% {ns, '*', [{nl, 3}, {nl, 4}]}, +% {nl, 5}]} +% ~> [1, 2, {ns, '*', [{nl, 3}, {nl, 4}]}, 5] +% @end +deleaf0({nl, L}) -> [L]; +deleaf0({ns, _, Ls}) -> dl0([], Ls). + +dl0(Stk, []) -> lists:reverse(Stk); +dl0(Stk, [{nl, X} | Rest]) -> dl0([X | Stk], Rest); +dl0(Stk, [X | Rest]) -> dl0([X | Stk], Rest). + + + +-spec releaf0(Root, Keeids) -> Rooted when + Root :: S, + Keeids :: [L | ntree(S, L)], + Rooted :: ntree(S, L), + S :: any(), + L :: any(). + +% @doc notional inverse of `deleaf0/1' +% +% Note that this does **NOT** double-wrap leafs in the +% input +releaf0(Root, Ks) -> + #ns{val = Root, + kids = lists:map(fun rl0/1, Ks)}. + +rl0(X = #ns{}) -> X; +rl0(X = #nl{}) -> X; +rl0(X) -> {nl, X}. diff --git a/src/gs_strmatch.erl b/src/gsc_strmatch.erl similarity index 99% rename from src/gs_strmatch.erl rename to src/gsc_strmatch.erl index 9fd6231..02992a8 100644 --- a/src/gs_strmatch.erl +++ b/src/gsc_strmatch.erl @@ -70,7 +70,7 @@ % `contract` gets tokenized as a keyword and not a variable name), and then % calls into this module in order to match the string shape it's looking for. % @end --module(gs_strmatch). +-module(gsc_strmatch). %-compile([export_all, nowarn_export_all]). diff --git a/src/gs_tokens.erl b/src/gsc_tokens.erl similarity index 91% rename from src/gs_tokens.erl rename to src/gsc_tokens.erl index 946b7c7..2c6ec56 100644 --- a/src/gs_tokens.erl +++ b/src/gsc_tokens.erl @@ -16,7 +16,7 @@ % 2. to future-proof in case we decide to incrementally incorporate the gsc % code into the legacy sophia compiler % @end --module(gs_tokens). +-module(gsc_tokens). % meta -export([ @@ -39,6 +39,9 @@ is_significant/1, filter_significant/1, significant_tokens/1, + very_stable_codepoints/1, + very_stable_string/1, + very_stable_characters/1, tokens_from_iolist/1, tokens/1, slurp_token/2, @@ -188,13 +191,13 @@ slurp_dlist(All, Opens, [#tk{str = "["} = Tk | NewTks]) -> slurp_dlist(All, Opens, [#tk{str = "{"} = Tk | NewTks]) -> slurp_dlist([Tk | All], [Tk | Opens], NewTks); % sad: mismatch cases -slurp_dlist(All, Opens, []) -> +slurp_dlist(_, Opens, []) -> {error, {fixme, mismatch, Opens, none}}; -slurp_dlist(All, Opens, [#tk{str = "}"} = BadClose | _]) -> +slurp_dlist(_, Opens, [#tk{str = "}"} = BadClose | _]) -> {error, {fixme, mismatch, Opens, {value, BadClose}}}; -slurp_dlist(All, Opens, [#tk{str = "]"} = BadClose | _]) -> +slurp_dlist(_, Opens, [#tk{str = "]"} = BadClose | _]) -> {error, {fixme, mismatch, Opens, {value, BadClose}}}; -slurp_dlist(All, Opens, [#tk{str = ")"} = BadClose | _]) -> +slurp_dlist(_, Opens, [#tk{str = ")"} = BadClose | _]) -> {error, {fixme, mismatch, Opens, {value, BadClose}}}; % general case: non-terminal token gets pushed slurp_dlist(All, Opens, [Tk | NewTks]) -> @@ -330,6 +333,29 @@ is_significant(#tk{shape = ws}) -> false; is_significant(_) -> true. + +% aliases +very_stable_string(X) -> very_stable_codepoints(X). +very_stable_characters(X) -> very_stable_codepoints(X). + + + +-spec very_stable_codepoints(IoList) -> NfcList when + IoList :: iolist(), + NfcList :: string(). + +%% @doc When Unicode sends its characters, they're not +%% sending their best. They're not sending ASCII. +%% They're not sending ASCII. They're sending +%% characters that have lots of problems, and they're +%% bringing those problems with us. They're bringing +%% diacritics. They're bringing homoglyphs. They're +%% bringing RTL. They're rapists. And some, we assume, +%% are good characters. +very_stable_codepoints(S) -> + unicode:characters_to_nfc_list(S). + + -spec tokens_from_iolist(SrcStr) -> Result when SrcStr :: iolist(), Result :: {ok, Tokens} @@ -341,6 +367,7 @@ tokens_from_iolist(S) -> tokens(S). + -spec tokens(SrcStr) -> Result when SrcStr :: iolist(), Result :: {ok, Tokens} @@ -355,7 +382,8 @@ tokens_from_iolist(S) -> tokens(S). tokens(S) -> % defensive normalization - tokens([], {1, 1}, unicode:characters_to_nfc_list(S)). + tokens([], {1, 1}, very_stable_codepoints(S)). + tokens(Stack, _FinalPos, "") -> {ok, lists:reverse(Stack)}; @@ -559,8 +587,8 @@ slurp_token_of_shape(bcom, Pos, SrcStr0) -> no_tokmatch end; slurp_token_of_shape(ws, Pos, SrcStr) -> - WhitespaceMatcher = gs_strmatch:smr_sf_ws(), - case gs_strmatch:match(WhitespaceMatcher, SrcStr) of + WhitespaceMatcher = gsc_strmatch:smr_sf_ws(), + case gsc_strmatch:match(WhitespaceMatcher, SrcStr) of no_strmatch -> no_tokmatch; {strmatch, WS, Rest} -> @@ -594,7 +622,7 @@ slurp_token_of_shape(kwd, Pos, SrcStr) -> no_tokmatch end; slurp_token_of_shape(op, Pos, SrcStr) -> - case gs_strmatch:match(gs_strmatch:smr_sf_op(), SrcStr) of + case gsc_strmatch:match(gsc_strmatch:smr_sf_op(), SrcStr) of {strmatch, Str, Rest} -> Token = #tk{shape = op, pos = Pos, str = Str}, {tokmatch, Token, Rest}; @@ -602,7 +630,7 @@ slurp_token_of_shape(op, Pos, SrcStr) -> no_tokmatch end; slurp_token_of_shape(punct, Pos, SrcStr) -> - case gs_strmatch:match(gs_strmatch:smr_sf_punct(), SrcStr) of + case gsc_strmatch:match(gsc_strmatch:smr_sf_punct(), SrcStr) of {strmatch, Str, Rest} -> Token = #tk{shape = punct, pos = Pos, str = Str}, {tokmatch, Token, Rest}; @@ -611,7 +639,7 @@ slurp_token_of_shape(punct, Pos, SrcStr) -> end; % SOPHIA VARIABLE NAMES: id, con, qid, qcon, tvar slurp_token_of_shape(id, Pos, SrcStr) -> - case gs_strmatch:match(gs_strmatch:smr_sf_id(), SrcStr) of + case gsc_strmatch:match(gsc_strmatch:smr_sf_id(), SrcStr) of {strmatch, IdStr, Rest} -> Token = #tk{shape = id, pos = Pos, str = IdStr}, {tokmatch, Token, Rest}; @@ -619,7 +647,7 @@ slurp_token_of_shape(id, Pos, SrcStr) -> no_tokmatch end; slurp_token_of_shape(con, Pos, SrcStr) -> - case gs_strmatch:match(gs_strmatch:smr_sf_con(), SrcStr) of + case gsc_strmatch:match(gsc_strmatch:smr_sf_con(), SrcStr) of {strmatch, Str, Rest} -> Token = #tk{shape = con, pos = Pos, str = Str}, {tokmatch, Token, Rest}; @@ -627,7 +655,7 @@ slurp_token_of_shape(con, Pos, SrcStr) -> no_tokmatch end; slurp_token_of_shape(qid, Pos, SrcStr) -> - case gs_strmatch:match(gs_strmatch:smr_sf_qid(), SrcStr) of + case gsc_strmatch:match(gsc_strmatch:smr_sf_qid(), SrcStr) of {strmatch, Str, Rest} -> Token = #tk{shape = qid, pos = Pos, str = Str}, {tokmatch, Token, Rest}; @@ -635,7 +663,7 @@ slurp_token_of_shape(qid, Pos, SrcStr) -> no_tokmatch end; slurp_token_of_shape(qcon, Pos, SrcStr) -> - case gs_strmatch:match(gs_strmatch:smr_sf_qcon(), SrcStr) of + case gsc_strmatch:match(gsc_strmatch:smr_sf_qcon(), SrcStr) of {strmatch, Str, Rest} -> Token = #tk{shape = qcon, pos = Pos, str = Str}, {tokmatch, Token, Rest}; @@ -643,7 +671,7 @@ slurp_token_of_shape(qcon, Pos, SrcStr) -> no_tokmatch end; slurp_token_of_shape(tvar, Pos, SrcStr) -> - case gs_strmatch:match(gs_strmatch:smr_sf_tvar(), SrcStr) of + case gsc_strmatch:match(gsc_strmatch:smr_sf_tvar(), SrcStr) of {strmatch, Str, Rest} -> Token = #tk{shape = tvar, pos = Pos, str = Str}, {tokmatch, Token, Rest}; @@ -651,7 +679,7 @@ slurp_token_of_shape(tvar, Pos, SrcStr) -> no_tokmatch end; slurp_token_of_shape(int16, Pos, SrcStr) -> - case gs_strmatch:match(gs_strmatch:smr_sf_int16(), SrcStr) of + case gsc_strmatch:match(gsc_strmatch:smr_sf_int16(), SrcStr) of {strmatch, Str, Rest} -> Token = #tk{shape = int16, pos = Pos, str = Str}, {tokmatch, Token, Rest}; @@ -659,7 +687,7 @@ slurp_token_of_shape(int16, Pos, SrcStr) -> no_tokmatch end; slurp_token_of_shape(int10, Pos, SrcStr) -> - case gs_strmatch:match(gs_strmatch:smr_sf_int10(), SrcStr) of + case gsc_strmatch:match(gsc_strmatch:smr_sf_int10(), SrcStr) of {strmatch, Str, Rest} -> Token = #tk{shape = int10, pos = Pos, str = Str}, {tokmatch, Token, Rest}; @@ -671,8 +699,8 @@ slurp_token_of_shape(int10, Pos, SrcStr) -> % % char: sophia char literal slurp_token_of_shape(ak, Pos, SrcStr) -> - StringMatcher = gs_strmatch:smr_sf_ak(), - case gs_strmatch:match(StringMatcher, SrcStr) of + StringMatcher = gsc_strmatch:smr_sf_ak(), + case gsc_strmatch:match(StringMatcher, SrcStr) of no_strmatch -> no_tokmatch; {strmatch, TokenStr, Rest} -> @@ -680,8 +708,8 @@ slurp_token_of_shape(ak, Pos, SrcStr) -> {tokmatch, Token, Rest} end; slurp_token_of_shape(ct, Pos, SrcStr) -> - StringMatcher = gs_strmatch:smr_sf_ct(), - case gs_strmatch:match(StringMatcher, SrcStr) of + StringMatcher = gsc_strmatch:smr_sf_ct(), + case gsc_strmatch:match(StringMatcher, SrcStr) of no_strmatch -> no_tokmatch; {strmatch, TokenStr, Rest} -> @@ -689,8 +717,8 @@ slurp_token_of_shape(ct, Pos, SrcStr) -> {tokmatch, Token, Rest} end; slurp_token_of_shape(sg, Pos, SrcStr) -> - StringMatcher = gs_strmatch:smr_sf_sg(), - case gs_strmatch:match(StringMatcher, SrcStr) of + StringMatcher = gsc_strmatch:smr_sf_sg(), + case gsc_strmatch:match(StringMatcher, SrcStr) of no_strmatch -> no_tokmatch; {strmatch, TokenStr, Rest} -> @@ -698,8 +726,8 @@ slurp_token_of_shape(sg, Pos, SrcStr) -> {tokmatch, Token, Rest} end; slurp_token_of_shape(char, Pos, SrcStr) -> - StringMatcher = gs_strmatch:smr_sf_char(), - case gs_strmatch:match(StringMatcher, SrcStr) of + StringMatcher = gsc_strmatch:smr_sf_char(), + case gsc_strmatch:match(StringMatcher, SrcStr) of no_strmatch -> no_tokmatch; {strmatch, TokenStr, Rest} -> @@ -707,7 +735,7 @@ slurp_token_of_shape(char, Pos, SrcStr) -> {tokmatch, Token, Rest} end; slurp_token_of_shape(string, Pos, SrcStr) -> - case gs_strmatch:match(gs_strmatch:smr_sf_str(), SrcStr) of + case gsc_strmatch:match(gsc_strmatch:smr_sf_str(), SrcStr) of no_strmatch -> no_tokmatch; {strmatch, TokenStr, Rest} -> @@ -715,7 +743,7 @@ slurp_token_of_shape(string, Pos, SrcStr) -> {tokmatch, Token, Rest} end; slurp_token_of_shape(bytes, Pos, SrcStr) -> - case gs_strmatch:match(gs_strmatch:smr_sf_bytes(), SrcStr) of + case gsc_strmatch:match(gsc_strmatch:smr_sf_bytes(), SrcStr) of no_strmatch -> no_tokmatch; {strmatch, TokenStr, Rest} -> diff --git a/src/gso_scan.erl b/src/gso_scan.erl index 9b7fb2d..374c191 100644 --- a/src/gso_scan.erl +++ b/src/gso_scan.erl @@ -1,6 +1,6 @@ % @doc compatibility layer to test against so_scan % -% converts gs_tokens data to so_scan tokens +% converts gsc_tokens data to so_scan tokens % % Ref: so_scan.erl -module(gso_scan). @@ -104,7 +104,7 @@ % @end scan(SrcStr) -> - case gs_tokens:tokens(SrcStr) of + case gsc_tokens:tokens(SrcStr) of {ok, SfLTokens} -> SoTokens = to_so_tokens(SfLTokens), {ok, SoTokens};