stopping point

This commit is contained in:
Peter Harpending
2026-06-02 16:51:05 -07:00
parent 2c36a02331
commit 4f4adaa284
13 changed files with 507 additions and 62 deletions
+12 -7
View File
@@ -1,10 +1,15 @@
# TODONE
# TODO # TODO
- barf for outputs, slurp for inputs - architecture needs more careful thought but only after something
- architecture needs more careful thought but only after something works works
- too fuzzy right now - too fuzzy right now
- possibly: - undo gs_ naming fuckery.. everything is `gsc_*`. it's just
- rename parser layers sequentially: needlessly confusing. for now let's name new things gsc_* and then
- gsc_ go back and undo the stupidity
# TONOTDO
- barf for outputs, slurp for inputs
- rename parser layers sequentially
# TODONE
+38 -2
View File
@@ -47,6 +47,8 @@ do(["list", "tests"]) ->
do_tlist(); do_tlist();
do(["test"]) -> do(["test"]) ->
do_tests(); do_tests();
do(["test" | Tests]) ->
do_tests(Tests);
do(["tests"]) -> do(["tests"]) ->
do_tests(); do_tests();
do(["run", "tests"]) -> do(["run", "tests"]) ->
@@ -106,14 +108,48 @@ less_file(Less, FilePath) ->
end. end.
do_tests() -> do_tests() ->
io:format("TestModules = ~p~n", [known_modules_with_prefix("ts")]), io:format("TestModules = ~p~n", [test_mods()]),
do_runall_tests(). do_runall_tests().
do_runall_tests() -> do_runall_tests() ->
lists:foreach(fun run_mod_main/1, test_mods()). lists:foreach(fun run_mod_main/1, test_mods()).
% @doc run the named tests one after another, in the order given.
% Each name is resolved to a test module by run_test/1.
do_tests(Names) ->
    lists:foreach(fun(Name) -> run_test(Name) end, Names).
% n
% @doc run one test module given its name as a string.
%
% The name is tried verbatim first, then with the "gsc_test_" prefix
% added, so both "gsc_test_ntree" and "ntree" work. The verbatim
% reading wins if both happen to be known test modules.
%
% Raises error({no_such_test, TestName}) if neither is known.
run_test(TestName) ->
    Candidates = [list_to_atom(TestName),
                  list_to_atom("gsc_test_" ++ TestName)],
    Known = test_mods(),
    IsKnown = fun(Candidate) -> lists:member(Candidate, Known) end,
    case lists:filter(IsKnown, Candidates) of
        [Mod | _] -> rmm(Mod);
        []        -> error({no_such_test, TestName})
    end.
% shorthand for run_mod_main/1
rmm(X) -> run_mod_main(X).
% KnownTests = test_mods(),
% TestMods = ensure_all_known([], List, KnownTests),
% lists:foreach(fun run_mod_main/1, TestMods).
%ensure_all_known(Acc, [], _) ->
% lists:sort(Acc);
%ensure_all_known(Acc, [T | Ts], Knowns) ->
% case lists:member(T, Knowns) of
%
% end.
test_mods() -> test_mods() ->
known_modules_with_prefix("gs_test"). known_modules_with_prefix("gsc_test").
known_modules_with_prefix(Pfx) -> known_modules_with_prefix(Pfx) ->
ModsZipBeamsZipLoaded = code:all_available(), ModsZipBeamsZipLoaded = code:all_available(),
+109
View File
@@ -0,0 +1,109 @@
-module(gsc_test_ntree).
-export([
main/0
]).
-include("$gsc_include/gsc.hrl").
% @doc test entry point: runs the examples in this module and
% returns ok
main() ->
    ok = x00().
% x00 = example00
%
% Prints every stage of the pipeline for the sample expression:
% source string, all tokens, signal tokens, and the resulting tree.
% Each stage is computed right before it is printed, so a crash in a
% later stage still leaves the earlier output on screen.
x00() ->
    io:format("Example 00:~n", []),
    Stages = [{" SrcStr = ~p~n", fun x00_src/0},
              {" Tokens = ~p~n", fun x00_tks/0},
              {" Signal = ~p~n", fun x00_sgl/0},
              {" Tree0 = ~p~n",  fun x00_tree0/0}],
    lists:foreach(fun({Fmt, Stage}) -> io:format(Fmt, [Stage()]) end,
                  Stages),
    ok.
% sample type expr, tokens, signal
%
% each fixture is derived from the previous one:
%   x00_src   - the source string
%   x00_tks   - every token of the source (crashes if tokenizing fails)
%   x00_sgl   - the tokens with the non-significant ones filtered out
%   x00_tree0 - the signal turned into a tree by mktree/1
x00_src() -> "foo => bar * baz".
x00_tks() -> gsc:unsafe_tokens_from_string(x00_src()).
x00_sgl() -> gsc:filter_signal(x00_tks()).
x00_tree0() -> mktree(x00_sgl()).
% records copypasta for now
-record(ns, {val :: any(), kids :: list(any())}).
-record(nl, {val :: any()}).
-type ntree(X, Y) :: gsc_ntree:ntree(X, Y).
-type ntree() :: gsc_ntree:ntree().
-type ast_stem_t() :: vtokens
| {op, tk()}
.
-type ast() :: ntree(ast_stem_t(), tk()).
-spec mktree(Signal) -> Tree when
    Signal :: gsc:signal(),
    Tree   :: gsc_ntree:ntree().
% @doc make into a tree
%
% Starts from a flat `vtokens' stem holding every signal token as a
% leaf, then re-roots it once per operator string, in the order
% listed ("=>" first, then "*").
%
% NOTE(review): each pass only looks at the direct children of the
% tree it is handed, so once "=>" has been hoisted the "*" pass sees
% two stems and no tokens -- confirm that is the intended stopping
% point.
mktree(Sig) ->
    Flat = gsc_ntree:nstem(vtokens, Sig),
    lists:foldl(fun rerootl_tkstr/2, Flat, ["=>", "*"]).
% @doc re-root a stem on the first direct child token whose string
% is S.
%
% The stem's children are unwrapped and split at the first token
% matching S. If there is one, the result is a new stem whose value
% is {op, Tk} (Tk being the matching token) with exactly two kids:
% the children before the match and the children after it, each
% wrapped in a stem that keeps the original root value. The
% right-hand stem is re-rooted on S again, so repeated occurrences
% nest to the right.
%
% If no direct child matches, the tree is returned unchanged.
%
% Only stems are accepted; there is no clause for a leaf.
rerootl_tkstr(S, Tree0 = #ns{val = Root0}) ->
    Kids0 = gsc_ntree:deleaf0(Tree0),
    IsntS = fun(Tk) -> isnt_str(S, Tk) end,
    case lists:splitwith(IsntS, Kids0) of
        % found
        % input:
        %   *s Root0
        %   |
        %   +-- .l Foo
        %   +-- .l "=>"
        %   +-- .l Bar
        % output:
        %   *s {op, "=>"}
        %   |
        %   +-- *s Root0 -- .l Foo
        %   +-- *s Root0 -- .l Bar
        {LHS1, [Tk0 | RHS1]} ->
            Root1 = Root0,
            LTree1 = gsc_ntree:releaf0(Root1, LHS1),
            % recurse on the right-hand side only
            RTree1 = rerootl_tkstr(S, gsc_ntree:releaf0(Root1, RHS1)),
            NewRoot0 = {op, Tk0},
            NewKids0 = [LTree1, RTree1],
            NewTree = gsc_ntree:releaf0(NewRoot0, NewKids0),
            NewTree;
        % not found, nothing to do
        % (Kids0 is already bound, so this asserts that the whole
        % child list ended up on the left of the split)
        {Kids0, []} ->
            Tree0
    end.
%reroot_mapsto(Tree0 = #ns{val = Root0}) ->
% Kids0 = gsc_ntree:deleaf0(Tree0),
% IsntMapsto = fun(DL) -> isnt_str("=>", Tk) end,
% case lists:splitwith(IsntMapsto, Kids0) of
% % found
% {LHS1, [Tk0 | RHS1]} ->
% Root1 = Root0,
% LTree1 = gsc_ntree:releaf0(Root1, LHS1),
% RTree1 = reroot_mapsto(gsc_ntree:releaf0(Root1, RHS1)),
% NewRoot0 = {op, Tk0},
% NewKids0 = [LTree1, RTree1],
% NewTree = gsc_ntree:releaf0(NewRoot0, NewKids0),
% NewTree;
% % nothing to do
% {Kids0, []} ->
% Tree0
% end.
% negation of is_str/2; used as the lists:splitwith/2 predicate
isnt_str(X, Y) ->
    not is_str(X, Y).
% true iff the second argument is a token whose string is exactly S;
% anything that is not a token is simply "not S"
is_str(S, #tk{str = TkStr}) -> TkStr =:= S;
is_str(_, _)                -> false.
@@ -1,5 +1,5 @@
% gsc tokenizer tests % gsc tokenizer tests
-module(gs_test_tokens). -module(gsc_test_tokens).
-export([ -export([
main/0, ct_dir/0 main/0, ct_dir/0
+25
View File
@@ -0,0 +1,25 @@
% testing utilities
-module(ts_utils).
-export([
ct_dir/0,
ct_file/1
]).
-spec ct_dir() -> string().
% @doc
% directory holding the test fixtures: the "ct" directory directly
% under the home directory reported by zx_daemon
ct_dir() ->
    Home = zx_daemon:get_home(),
    Home ++ "/ct".
-spec ct_file(Name) -> AbsPath when
    Name    :: string(),
    AbsPath :: string().
% @doc
% path of a file inside ct_dir/0
%
% ct_file("foo.aes") -> "/path/to/ct/foo.aes"
ct_file(Name) ->
    ct_dir() ++ [$/ | Name].
+9 -9
View File
@@ -196,13 +196,13 @@
%gulp_file([]) -> %gulp_file([]) ->
% {error, empty_file}; % {error, empty_file};
%gulp_file(Tokens) -> %gulp_file(Tokens) ->
% case gs_tokens:take_block(Tokens) of % case gsc_tokens:take_block(Tokens) of
% {Tokens, []} -> % {Tokens, []} ->
% gulp_block(fun gulp_top_decl/1, Tokens); % gulp_block(fun gulp_top_decl/1, Tokens);
% %gulp_file2([], [], Tokens); % %gulp_file2([], [], Tokens);
% {A, B} -> % {A, B} ->
% StartPos = gs_tokens:start_pos(A), % StartPos = gsc_tokens:start_pos(A),
% ErrPos = gs_tokens:start_pos(B), % ErrPos = gsc_tokens:start_pos(B),
% Msg = efmt("gulp_file: block starting at ~p ends at ~p instead of EOF", % Msg = efmt("gulp_file: block starting at ~p ends at ~p instead of EOF",
% [StartPos, ErrPos]), % [StartPos, ErrPos]),
% {error, #parse_error{pos = ErrPos, msg = Msg}} % {error, #parse_error{pos = ErrPos, msg = Msg}}
@@ -212,7 +212,7 @@
% %
%%gulp_file2(AccOks, AccErrs, Tokens = [_ | _]) -> %%gulp_file2(AccOks, AccErrs, Tokens = [_ | _]) ->
%% % ItemTokens will be nonempty %% % ItemTokens will be nonempty
%% {ItemTokens, NewTokens} = gs_tokens:take_block_item(Tokens), %% {ItemTokens, NewTokens} = gsc_tokens:take_block_item(Tokens),
%% case gulp_top_decl(ItemTokens) of %% case gulp_top_decl(ItemTokens) of
%% {gulp, Ok} -> gulp_file2([Ok | AccOks], AccErrs, NewTokens); %% {gulp, Ok} -> gulp_file2([Ok | AccOks], AccErrs, NewTokens);
%% Err -> gulp_file2(AccOks, [Err | AccErrs], NewTokens) %% Err -> gulp_file2(AccOks, [Err | AccErrs], NewTokens)
@@ -258,7 +258,7 @@
% %
%gulp_block(GulpItem, AccOks, AccErrs, Tokens = [_ | _]) -> %gulp_block(GulpItem, AccOks, AccErrs, Tokens = [_ | _]) ->
% % ItemTokens will be nonempty % % ItemTokens will be nonempty
% {ItemTokens, NewTokens} = gs_tokens:take_block_item(Tokens), % {ItemTokens, NewTokens} = gsc_tokens:take_block_item(Tokens),
% case GulpItem(ItemTokens) of % case GulpItem(ItemTokens) of
% {gulp, Ok} -> gulp_block(GulpItem, [Ok | AccOks], AccErrs, NewTokens); % {gulp, Ok} -> gulp_block(GulpItem, [Ok | AccOks], AccErrs, NewTokens);
% Err -> gulp_block(GulpItem, AccOks, [Err | AccErrs], NewTokens) % Err -> gulp_block(GulpItem, AccOks, [Err | AccErrs], NewTokens)
@@ -284,7 +284,7 @@
%% | Using %% | Using
%% @end %% @end
%gulp_top_decl(DeclTokens) -> %gulp_top_decl(DeclTokens) ->
% case gs_tokens:strings(3, DeclTokens) of % case gsc_tokens:strings(3, DeclTokens) of
% ["payable", "contract", "interface"] -> % ["payable", "contract", "interface"] ->
% gulp_nyi(DeclTokens); % gulp_nyi(DeclTokens);
% ["contract", "interface" | _] -> % ["contract", "interface" | _] ->
@@ -410,7 +410,7 @@
%% | (EModifier* 'entrypoint' | FModifier* 'function') Block(FunDecl) %% | (EModifier* 'entrypoint' | FModifier* 'function') Block(FunDecl)
%% | Using %% | Using
%gulp_decl(Tokens) -> %gulp_decl(Tokens) ->
% case gs_tokens:strings(1, Tokens) of % case gsc_tokens:strings(1, Tokens) of
% ["type"] -> gulp_type_alias(Tokens); % ["type"] -> gulp_type_alias(Tokens);
% _ -> gulp_nyi(Tokens) % _ -> gulp_nyi(Tokens)
% end. % end.
@@ -611,7 +611,7 @@
%% Type1 = {plist, Types} () (foo) (foo, bar) %% Type1 = {plist, Types} () (foo) (foo, bar)
%% | {token, #tk{}} foo Bar.baz 'quux %% | {token, #tk{}} foo Bar.baz 'quux
%slurp_type1(Tks) -> %slurp_type1(Tks) ->
% case gs_tokens:slurp_plist(Tks) of % case gsc_tokens:slurp_plist(Tks) of
% % head token is NOT open paren -> must be id/qid/tvar % % head token is NOT open paren -> must be id/qid/tvar
% {slurp, [], [Tk | NewTks]} -> % {slurp, [], [Tk | NewTks]} ->
% TkType = Tk#tk.type, % TkType = Tk#tk.type,
@@ -633,7 +633,7 @@
% %
% %
%%slurp_type_expr_plist(Tks) -> %%slurp_type_expr_plist(Tks) ->
%% case gs_tokens:slurp_plist(Tks) of %% case gsc_tokens:slurp_plist(Tks) of
%% % head token is NOT open paren -> must be id/qid/tvar %% % head token is NOT open paren -> must be id/qid/tvar
%% {slurp, [], [Tk | NewTks]} -> %% {slurp, [], [Tk | NewTks]} ->
%% TkType = Tk#tk.type, %% TkType = Tk#tk.type,
+3 -3
View File
@@ -43,7 +43,7 @@
% %
%% @doc for testing %% @doc for testing
%unsafe_vtks_from_string(S) -> %unsafe_vtks_from_string(S) ->
% {ok, SigTks} = gs_tokens:significant_tokens(S), % {ok, SigTks} = gsc_tokens:significant_tokens(S),
% {gulp, Vtks} = gulp_vtks(SigTks), % {gulp, Vtks} = gulp_vtks(SigTks),
% Vtks. % Vtks.
% %
@@ -110,7 +110,7 @@
% end. % end.
% %
%slurp_plist_rec(Tokens = [#tk{string = "(" | _]) -> %slurp_plist_rec(Tokens = [#tk{string = "(" | _]) ->
% case gs_tokens:slurp_plist(Tokens) of % case gsc_tokens:slurp_plist(Tokens) of
% {slurp, [], _} -> % {slurp, [], _} ->
% barf; % barf;
% {slurp, PTokens, NewTokens} -> % {slurp, PTokens, NewTokens} ->
@@ -156,7 +156,7 @@
% {_Pfx = Tks1_BeforeOpen, % {_Pfx = Tks1_BeforeOpen,
% _Sfx = Tks2_OpenNAfter % _Sfx = Tks2_OpenNAfter
% = [#tk{string = "("} | _]} -> % = [#tk{string = "("} | _]} ->
% case gs_tokens:slurp_plist(Tks2_OpenNAfter) of % case gsc_tokens:slurp_plist(Tks2_OpenNAfter) of
% {slurp, Tks2A_OpenToClose, Tks2B_AfterClose} -> % {slurp, Tks2A_OpenToClose, Tks2B_AfterClose} ->
% NewAcc = [Acc, % NewAcc = [Acc,
% Tks1_BeforeOpen, % Tks1_BeforeOpen,
+1 -1
View File
@@ -63,7 +63,7 @@
%-spec end_pos([gsc_token()]) -> {value, tk_pos()} | none. %-spec end_pos([gsc_token()]) -> {value, tk_pos()} | none.
% %
%end_pos([#gsc_token{pos = Pos, string = Str}]) -> %end_pos([#gsc_token{pos = Pos, string = Str}]) ->
% {value, gs_tokens:new_pos(Pos, Str)}; % {value, gsc_tokens:new_pos(Pos, Str)};
%end_pos([_ | T]) -> %end_pos([_ | T]) ->
% end_pos(T); % end_pos(T);
%end_pos([]) -> %end_pos([]) ->
+124 -9
View File
@@ -4,12 +4,12 @@
% based on original sophia compiler % based on original sophia compiler
% %
% parse layers: % parse layers:
% 1. gs_tokens: SrcStr -> (Tokens | SigTokens) % 1. gsc_tokens: SrcStr -> (Tokens | SigTokens)
% %
% SigTokens = not comment/whitespace % SigTokens = not comment/whitespace
% %
% layers: % layers:
% a. gs_strmatch : matches string shapes % a. gsc_strmatch : matches string shapes
% b. gso_scan : converts to so_scan shapes % b. gso_scan : converts to so_scan shapes
% %
% %
@@ -32,14 +32,29 @@
-module(gsc). -module(gsc).
-export_type([ -export_type([
token/0 token/0,
signal/0
]). ]).
-export([ -export([
unsafe_tokens_from_file/1,
unsafe_tokens_from_string/1,
unsafe_signal_from_file/1,
unsafe_signal_from_string/1,
filter_signal/1,
signal_from_string/1,
signal_from_file/1,
sigtokens_from_file/1, sigtokens_from_file/1,
sigtokens_from_string/1, sigtokens_from_string/1,
tokens_from_file/1, tokens_from_file/1,
tokens_from_string/1 tokens_from_string/1,
% sophia compatibility
gso_tokens_from_file/1,
gso_tokens_from_string/1,
% unicode normalization
very_stable_codepoints/1,
very_stable_string/1,
very_stable_file/1
]). ]).
-include("$gsc_include/gsc.hrl"). -include("$gsc_include/gsc.hrl").
@@ -50,19 +65,52 @@
-type token() :: tk(). -type token() :: tk().
% @doc signal means non-noise (whitespace/comment)
% tokens; legacy name still around is "sigtokens"
-type signal() :: [tk()].
%----------------------------------------- %-----------------------------------------
% functions % API: FUNCTIONS
%----------------------------------------- %-----------------------------------------
%-----------------------------------------
% aint nobody got time for case shit
%-----------------------------------------
% tokens
% @doc like tokens_from_file/1, but returns the bare token list and
% crashes with badmatch on anything other than {ok, Tokens}
unsafe_tokens_from_file(F) ->
    {ok, Tks} = tokens_from_file(F),
    Tks.
% @doc like tokens_from_string/1, but returns the bare token list and
% crashes with badmatch on anything other than {ok, Tokens}
unsafe_tokens_from_string(S) ->
    {ok, Tks} = tokens_from_string(S),
    Tks.
% signal
% @doc like signal_from_file/1, but returns the bare signal list and
% crashes with badmatch on anything other than {ok, Signal}
unsafe_signal_from_file(F) ->
    {ok, Tks} = signal_from_file(F),
    Tks.
% @doc like signal_from_string/1, but returns the bare signal list and
% crashes with badmatch on anything other than {ok, Signal}
unsafe_signal_from_string(S) ->
    {ok, Tks} = signal_from_string(S),
    Tks.
%
% "signal" names for the older "significant"/"sigtokens" functions;
% each one just forwards its argument
filter_signal(X) -> gsc_tokens:filter_significant(X).
signal_from_file(X) -> sigtokens_from_file(X).
signal_from_string(X) -> sigtokens_from_string(X).
% @doc legacy name for signal
sigtokens_from_file(X) -> sigtokens_from_file(X) ->
case tokens_from_file(X) of case tokens_from_file(X) of
{ok, Y} -> {ok, gs_tokens:filter_significant(Y)}; {ok, Y} -> {ok, gsc_tokens:filter_significant(Y)};
Err -> Err Err -> Err
end. end.
sigtokens_from_string(X) -> sigtokens_from_string(X) ->
case tokens_from_string(X) of case tokens_from_string(X) of
{ok, Y} -> {ok, gs_tokens:filter_significant(Y)}; {ok, Y} -> {ok, gsc_tokens:filter_significant(Y)};
Err -> Err Err -> Err
end. end.
@@ -81,7 +129,6 @@ tokens_from_file(FilePath) ->
-spec tokens_from_string(SrcStr) -> Result -spec tokens_from_string(SrcStr) -> Result
when SrcStr :: string(), when SrcStr :: string(),
Result :: {ok, Tokens} Result :: {ok, Tokens}
@@ -89,4 +136,72 @@ tokens_from_file(FilePath) ->
Tokens :: [tk()]. Tokens :: [tk()].
tokens_from_string(SrcStr) -> tokens_from_string(SrcStr) ->
gs_tokens:tokens(SrcStr). gsc_tokens:tokens(SrcStr).
-spec gso_tokens_from_file(FilePath) -> Result when
    FilePath :: string(),
    Result   :: {ok, GsoTks} | {error, Reason},
    GsoTks   :: [gso_scan:so_token()],
    Reason   :: gsc_err() | any().
% @doc read a file and scan its contents via
% gso_tokens_from_string/1; a failed read is returned as-is
gso_tokens_from_file(FilePath) ->
    gso_tokens_from_read(file:read_file(FilePath)).

% continue with the outcome of file:read_file/1
gso_tokens_from_read({ok, Bytes}) -> gso_tokens_from_string(Bytes);
gso_tokens_from_read(Error)       -> Error.
-spec gso_tokens_from_string(Str) -> Result when
    Str    :: iolist(),
    Result :: {ok, GsoTks} | {error, Reason},
    GsoTks :: [gso_scan:so_token()],
    Reason :: gsc_err() | any().
% @doc normalize the input with gsc_tokens:very_stable_codepoints/1,
% then hand it to gso_scan:scan/1
gso_tokens_from_string(Raw) ->
    gso_scan:scan(gsc_tokens:very_stable_codepoints(Raw)).
-spec very_stable_codepoints(String) -> Normalized when
    String     :: iolist(),
    Normalized :: string().
%% @doc normalize character data to NFC, returned as a list of code
%% points (not UTF-8 bytes); forwards to
%% `gsc_tokens:very_stable_codepoints/1'
very_stable_codepoints(X) ->
    gsc_tokens:very_stable_codepoints(X).
-spec very_stable_string(String) -> Normalized when
    String     :: iolist(),
    Normalized :: string().
%% @doc alias for `very_stable_codepoints/1'; forwards straight to
%% `gsc_tokens:very_stable_codepoints/1'
very_stable_string(X) ->
    gsc_tokens:very_stable_codepoints(X).
-spec very_stable_file(FilePath) -> Contents when
    FilePath :: string(),
    Contents :: string().
%% @doc Read a file and return its contents normalized by
%% `very_stable_codepoints/1'.
%%
%% Despite the name this is not "stable" in the sense of never
%% failing: if the file cannot be read (e.g. it does not exist), the
%% result of `file:read_file/1' is raised as an error instead of
%% being returned.
%%
%% Meant for scripting/shell convenience.
very_stable_file(FilePath) ->
    case file:read_file(FilePath) of
        {ok, Bytes} ->
            very_stable_codepoints(Bytes);
        ReadFailure ->
            error(ReadFailure)
    end.
+127
View File
@@ -0,0 +1,127 @@
-module(gsc_ntree).
-export_type([
ntree/2,
ntree/0
]).
-export([
nstem/2,
flatten/1,
deleaf0/1,
releaf0/2
]).
-include("$gsc_include/gsc.hrl").
%%=====================================================
%% API: types
%%=====================================================
-record(ns, {val :: any(), kids :: list(any())}).
-record(nl, {val :: any()}).
%% @doc ntree(S, L) is a "node tree" (meaning stems
%% have values and children)
-type ntree(S, L)
:: #ns{val :: S, kids :: [ntree(S, L)]}
| #nl{val :: L}.
-type ntree() :: ntree(any(), any()).
%%=====================================================
%% API: functions
%%=====================================================
-spec nstem(Root, List) -> Tree when
Root :: X,
List :: list(Y),
Tree :: ntree(X, Y),
X :: any(),
Y :: any().
% @doc
% You *probably* want `releaf0/2' instead.
%
% This function naively wraps each element in the list
% in a leaf type, even if it's already wrapped.
%
% nstem(root, [Foo, Bar, Baz]) ~>
% *s root
% |
% +--- .l Foo
% |
% +--- .l Bar
% |
% +--- .l Baz
%
% Much more common use case is to releaf only the input
% nodes which are not already wrapped, which is what
% `releaf0/2' does.
% @end
nstem(Root, List) ->
    % every element becomes a leaf, no questions asked
    Leaves = [{nl, Item} || Item <- List],
    {ns, Root, Leaves}.
-spec flatten(Tree) -> LeafVals when
Tree :: ntree(any(), LeafType),
LeafVals :: [LeafType],
LeafType :: any().
% @doc collect the leaf values of a tree, left to right.
%
% Leaf values are returned untouched: a leaf holding a list (for
% example a string) stays one element of the result. Only the tree
% structure is flattened, never the values themselves.
flatten({nl, X}) ->
    [X];
flatten({ns, _, Keeids}) ->
    % flatmap concatenates one level only; lists:flatten/1 would also
    % splice list-valued leaves into the result
    lists:flatmap(fun flatten/1, Keeids).
-spec deleaf0(Tree) -> Result when
Tree :: ntree(S, L),
Result :: [L | Tree],
S :: any(),
L :: any().
% @doc unwrap the leaf children, and leave the stem
% children intact
%
% ex. 1:
% (+ 1 2 (* 3 4) 5)
% ~> '(1 2 (* 3 4) 5)
%
% ex. 2:
% {ns, '+', [{nl, 1},
% {nl, 2},
% {ns, '*', [{nl, 3}, {nl, 4}]},
% {nl, 5}]}
% ~> [1, 2, {ns, '*', [{nl, 3}, {nl, 4}]}, 5]
% @end
deleaf0({nl, L})       -> [L];
deleaf0({ns, _, Kids}) -> lists:map(fun dl0/1, Kids).

% unwrap a leaf; anything else (i.e. a stem) is passed through
dl0({nl, Val}) -> Val;
dl0(Other)     -> Other.
-spec releaf0(Root, Keeids) -> Rooted when
Root :: S,
Keeids :: [L | ntree(S, L)],
Rooted :: ntree(S, L),
S :: any(),
L :: any().
% @doc notional inverse of `deleaf0/1'
%
% Note that this does **NOT** double-wrap leafs in the
% input
releaf0(Root, Ks) ->
    Kids = lists:map(fun rl0/1, Ks),
    #ns{val = Root, kids = Kids}.

% stems and leafs are kept as they are; only bare values get wrapped
rl0(#ns{} = Stem) -> Stem;
rl0(#nl{} = Leaf) -> Leaf;
rl0(Bare)         -> #nl{val = Bare}.
+1 -1
View File
@@ -70,7 +70,7 @@
% `contract` gets tokenized as a keyword and not a variable name), and then % `contract` gets tokenized as a keyword and not a variable name), and then
% calls into this module in order to match the string shape it's looking for. % calls into this module in order to match the string shape it's looking for.
% @end % @end
-module(gs_strmatch). -module(gsc_strmatch).
%-compile([export_all, nowarn_export_all]). %-compile([export_all, nowarn_export_all]).
+55 -27
View File
@@ -16,7 +16,7 @@
% 2. to future-proof in case we decide to incrementally incorporate the gsc % 2. to future-proof in case we decide to incrementally incorporate the gsc
% code into the legacy sophia compiler % code into the legacy sophia compiler
% @end % @end
-module(gs_tokens). -module(gsc_tokens).
% meta % meta
-export([ -export([
@@ -39,6 +39,9 @@
is_significant/1, is_significant/1,
filter_significant/1, filter_significant/1,
significant_tokens/1, significant_tokens/1,
very_stable_codepoints/1,
very_stable_string/1,
very_stable_characters/1,
tokens_from_iolist/1, tokens_from_iolist/1,
tokens/1, tokens/1,
slurp_token/2, slurp_token/2,
@@ -188,13 +191,13 @@ slurp_dlist(All, Opens, [#tk{str = "["} = Tk | NewTks]) ->
slurp_dlist(All, Opens, [#tk{str = "{"} = Tk | NewTks]) -> slurp_dlist(All, Opens, [#tk{str = "{"} = Tk | NewTks]) ->
slurp_dlist([Tk | All], [Tk | Opens], NewTks); slurp_dlist([Tk | All], [Tk | Opens], NewTks);
% sad: mismatch cases % sad: mismatch cases
slurp_dlist(All, Opens, []) -> slurp_dlist(_, Opens, []) ->
{error, {fixme, mismatch, Opens, none}}; {error, {fixme, mismatch, Opens, none}};
slurp_dlist(All, Opens, [#tk{str = "}"} = BadClose | _]) -> slurp_dlist(_, Opens, [#tk{str = "}"} = BadClose | _]) ->
{error, {fixme, mismatch, Opens, {value, BadClose}}}; {error, {fixme, mismatch, Opens, {value, BadClose}}};
slurp_dlist(All, Opens, [#tk{str = "]"} = BadClose | _]) -> slurp_dlist(_, Opens, [#tk{str = "]"} = BadClose | _]) ->
{error, {fixme, mismatch, Opens, {value, BadClose}}}; {error, {fixme, mismatch, Opens, {value, BadClose}}};
slurp_dlist(All, Opens, [#tk{str = ")"} = BadClose | _]) -> slurp_dlist(_, Opens, [#tk{str = ")"} = BadClose | _]) ->
{error, {fixme, mismatch, Opens, {value, BadClose}}}; {error, {fixme, mismatch, Opens, {value, BadClose}}};
% general case: non-terminal token gets pushed % general case: non-terminal token gets pushed
slurp_dlist(All, Opens, [Tk | NewTks]) -> slurp_dlist(All, Opens, [Tk | NewTks]) ->
@@ -330,6 +333,29 @@ is_significant(#tk{shape = ws}) -> false;
is_significant(_) -> true. is_significant(_) -> true.
% aliases: both forward to very_stable_codepoints/1
very_stable_string(X) -> very_stable_codepoints(X).
very_stable_characters(X) -> very_stable_codepoints(X).
-spec very_stable_codepoints(IoList) -> NfcList when
    IoList  :: iolist(),
    NfcList :: string().
%% @doc Normalize character data to NFC, returned as a list of code
%% points.
%%
%% Thin wrapper around `unicode:characters_to_nfc_list/1'. The
%% tokenizer runs its input through this first, so that text which
%% differs only in how characters are composed reaches it as the same
%% code point sequence.
%%
%% NOTE(review): the spec promises a string(), but the wrapped
%% function presumably returns an error tuple rather than a list for
%% invalid input -- confirm against the OTP docs and decide whether
%% callers need to handle that.
very_stable_codepoints(S) ->
    unicode:characters_to_nfc_list(S).
-spec tokens_from_iolist(SrcStr) -> Result when -spec tokens_from_iolist(SrcStr) -> Result when
SrcStr :: iolist(), SrcStr :: iolist(),
Result :: {ok, Tokens} Result :: {ok, Tokens}
@@ -341,6 +367,7 @@ tokens_from_iolist(S) -> tokens(S).
-spec tokens(SrcStr) -> Result -spec tokens(SrcStr) -> Result
when SrcStr :: iolist(), when SrcStr :: iolist(),
Result :: {ok, Tokens} Result :: {ok, Tokens}
@@ -355,7 +382,8 @@ tokens_from_iolist(S) -> tokens(S).
tokens(S) -> tokens(S) ->
% defensive normalization % defensive normalization
tokens([], {1, 1}, unicode:characters_to_nfc_list(S)). tokens([], {1, 1}, very_stable_codepoints(S)).
tokens(Stack, _FinalPos, "") -> tokens(Stack, _FinalPos, "") ->
{ok, lists:reverse(Stack)}; {ok, lists:reverse(Stack)};
@@ -559,8 +587,8 @@ slurp_token_of_shape(bcom, Pos, SrcStr0) ->
no_tokmatch no_tokmatch
end; end;
slurp_token_of_shape(ws, Pos, SrcStr) -> slurp_token_of_shape(ws, Pos, SrcStr) ->
WhitespaceMatcher = gs_strmatch:smr_sf_ws(), WhitespaceMatcher = gsc_strmatch:smr_sf_ws(),
case gs_strmatch:match(WhitespaceMatcher, SrcStr) of case gsc_strmatch:match(WhitespaceMatcher, SrcStr) of
no_strmatch -> no_strmatch ->
no_tokmatch; no_tokmatch;
{strmatch, WS, Rest} -> {strmatch, WS, Rest} ->
@@ -594,7 +622,7 @@ slurp_token_of_shape(kwd, Pos, SrcStr) ->
no_tokmatch no_tokmatch
end; end;
slurp_token_of_shape(op, Pos, SrcStr) -> slurp_token_of_shape(op, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_op(), SrcStr) of case gsc_strmatch:match(gsc_strmatch:smr_sf_op(), SrcStr) of
{strmatch, Str, Rest} -> {strmatch, Str, Rest} ->
Token = #tk{shape = op, pos = Pos, str = Str}, Token = #tk{shape = op, pos = Pos, str = Str},
{tokmatch, Token, Rest}; {tokmatch, Token, Rest};
@@ -602,7 +630,7 @@ slurp_token_of_shape(op, Pos, SrcStr) ->
no_tokmatch no_tokmatch
end; end;
slurp_token_of_shape(punct, Pos, SrcStr) -> slurp_token_of_shape(punct, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_punct(), SrcStr) of case gsc_strmatch:match(gsc_strmatch:smr_sf_punct(), SrcStr) of
{strmatch, Str, Rest} -> {strmatch, Str, Rest} ->
Token = #tk{shape = punct, pos = Pos, str = Str}, Token = #tk{shape = punct, pos = Pos, str = Str},
{tokmatch, Token, Rest}; {tokmatch, Token, Rest};
@@ -611,7 +639,7 @@ slurp_token_of_shape(punct, Pos, SrcStr) ->
end; end;
% SOPHIA VARIABLE NAMES: id, con, qid, qcon, tvar % SOPHIA VARIABLE NAMES: id, con, qid, qcon, tvar
slurp_token_of_shape(id, Pos, SrcStr) -> slurp_token_of_shape(id, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_id(), SrcStr) of case gsc_strmatch:match(gsc_strmatch:smr_sf_id(), SrcStr) of
{strmatch, IdStr, Rest} -> {strmatch, IdStr, Rest} ->
Token = #tk{shape = id, pos = Pos, str = IdStr}, Token = #tk{shape = id, pos = Pos, str = IdStr},
{tokmatch, Token, Rest}; {tokmatch, Token, Rest};
@@ -619,7 +647,7 @@ slurp_token_of_shape(id, Pos, SrcStr) ->
no_tokmatch no_tokmatch
end; end;
slurp_token_of_shape(con, Pos, SrcStr) -> slurp_token_of_shape(con, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_con(), SrcStr) of case gsc_strmatch:match(gsc_strmatch:smr_sf_con(), SrcStr) of
{strmatch, Str, Rest} -> {strmatch, Str, Rest} ->
Token = #tk{shape = con, pos = Pos, str = Str}, Token = #tk{shape = con, pos = Pos, str = Str},
{tokmatch, Token, Rest}; {tokmatch, Token, Rest};
@@ -627,7 +655,7 @@ slurp_token_of_shape(con, Pos, SrcStr) ->
no_tokmatch no_tokmatch
end; end;
slurp_token_of_shape(qid, Pos, SrcStr) -> slurp_token_of_shape(qid, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_qid(), SrcStr) of case gsc_strmatch:match(gsc_strmatch:smr_sf_qid(), SrcStr) of
{strmatch, Str, Rest} -> {strmatch, Str, Rest} ->
Token = #tk{shape = qid, pos = Pos, str = Str}, Token = #tk{shape = qid, pos = Pos, str = Str},
{tokmatch, Token, Rest}; {tokmatch, Token, Rest};
@@ -635,7 +663,7 @@ slurp_token_of_shape(qid, Pos, SrcStr) ->
no_tokmatch no_tokmatch
end; end;
slurp_token_of_shape(qcon, Pos, SrcStr) -> slurp_token_of_shape(qcon, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_qcon(), SrcStr) of case gsc_strmatch:match(gsc_strmatch:smr_sf_qcon(), SrcStr) of
{strmatch, Str, Rest} -> {strmatch, Str, Rest} ->
Token = #tk{shape = qcon, pos = Pos, str = Str}, Token = #tk{shape = qcon, pos = Pos, str = Str},
{tokmatch, Token, Rest}; {tokmatch, Token, Rest};
@@ -643,7 +671,7 @@ slurp_token_of_shape(qcon, Pos, SrcStr) ->
no_tokmatch no_tokmatch
end; end;
slurp_token_of_shape(tvar, Pos, SrcStr) -> slurp_token_of_shape(tvar, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_tvar(), SrcStr) of case gsc_strmatch:match(gsc_strmatch:smr_sf_tvar(), SrcStr) of
{strmatch, Str, Rest} -> {strmatch, Str, Rest} ->
Token = #tk{shape = tvar, pos = Pos, str = Str}, Token = #tk{shape = tvar, pos = Pos, str = Str},
{tokmatch, Token, Rest}; {tokmatch, Token, Rest};
@@ -651,7 +679,7 @@ slurp_token_of_shape(tvar, Pos, SrcStr) ->
no_tokmatch no_tokmatch
end; end;
slurp_token_of_shape(int16, Pos, SrcStr) -> slurp_token_of_shape(int16, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_int16(), SrcStr) of case gsc_strmatch:match(gsc_strmatch:smr_sf_int16(), SrcStr) of
{strmatch, Str, Rest} -> {strmatch, Str, Rest} ->
Token = #tk{shape = int16, pos = Pos, str = Str}, Token = #tk{shape = int16, pos = Pos, str = Str},
{tokmatch, Token, Rest}; {tokmatch, Token, Rest};
@@ -659,7 +687,7 @@ slurp_token_of_shape(int16, Pos, SrcStr) ->
no_tokmatch no_tokmatch
end; end;
slurp_token_of_shape(int10, Pos, SrcStr) -> slurp_token_of_shape(int10, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_int10(), SrcStr) of case gsc_strmatch:match(gsc_strmatch:smr_sf_int10(), SrcStr) of
{strmatch, Str, Rest} -> {strmatch, Str, Rest} ->
Token = #tk{shape = int10, pos = Pos, str = Str}, Token = #tk{shape = int10, pos = Pos, str = Str},
{tokmatch, Token, Rest}; {tokmatch, Token, Rest};
@@ -671,8 +699,8 @@ slurp_token_of_shape(int10, Pos, SrcStr) ->
% %
% char: sophia char literal % char: sophia char literal
slurp_token_of_shape(ak, Pos, SrcStr) -> slurp_token_of_shape(ak, Pos, SrcStr) ->
StringMatcher = gs_strmatch:smr_sf_ak(), StringMatcher = gsc_strmatch:smr_sf_ak(),
case gs_strmatch:match(StringMatcher, SrcStr) of case gsc_strmatch:match(StringMatcher, SrcStr) of
no_strmatch -> no_strmatch ->
no_tokmatch; no_tokmatch;
{strmatch, TokenStr, Rest} -> {strmatch, TokenStr, Rest} ->
@@ -680,8 +708,8 @@ slurp_token_of_shape(ak, Pos, SrcStr) ->
{tokmatch, Token, Rest} {tokmatch, Token, Rest}
end; end;
slurp_token_of_shape(ct, Pos, SrcStr) -> slurp_token_of_shape(ct, Pos, SrcStr) ->
StringMatcher = gs_strmatch:smr_sf_ct(), StringMatcher = gsc_strmatch:smr_sf_ct(),
case gs_strmatch:match(StringMatcher, SrcStr) of case gsc_strmatch:match(StringMatcher, SrcStr) of
no_strmatch -> no_strmatch ->
no_tokmatch; no_tokmatch;
{strmatch, TokenStr, Rest} -> {strmatch, TokenStr, Rest} ->
@@ -689,8 +717,8 @@ slurp_token_of_shape(ct, Pos, SrcStr) ->
{tokmatch, Token, Rest} {tokmatch, Token, Rest}
end; end;
slurp_token_of_shape(sg, Pos, SrcStr) -> slurp_token_of_shape(sg, Pos, SrcStr) ->
StringMatcher = gs_strmatch:smr_sf_sg(), StringMatcher = gsc_strmatch:smr_sf_sg(),
case gs_strmatch:match(StringMatcher, SrcStr) of case gsc_strmatch:match(StringMatcher, SrcStr) of
no_strmatch -> no_strmatch ->
no_tokmatch; no_tokmatch;
{strmatch, TokenStr, Rest} -> {strmatch, TokenStr, Rest} ->
@@ -698,8 +726,8 @@ slurp_token_of_shape(sg, Pos, SrcStr) ->
{tokmatch, Token, Rest} {tokmatch, Token, Rest}
end; end;
slurp_token_of_shape(char, Pos, SrcStr) -> slurp_token_of_shape(char, Pos, SrcStr) ->
StringMatcher = gs_strmatch:smr_sf_char(), StringMatcher = gsc_strmatch:smr_sf_char(),
case gs_strmatch:match(StringMatcher, SrcStr) of case gsc_strmatch:match(StringMatcher, SrcStr) of
no_strmatch -> no_strmatch ->
no_tokmatch; no_tokmatch;
{strmatch, TokenStr, Rest} -> {strmatch, TokenStr, Rest} ->
@@ -707,7 +735,7 @@ slurp_token_of_shape(char, Pos, SrcStr) ->
{tokmatch, Token, Rest} {tokmatch, Token, Rest}
end; end;
slurp_token_of_shape(string, Pos, SrcStr) -> slurp_token_of_shape(string, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_str(), SrcStr) of case gsc_strmatch:match(gsc_strmatch:smr_sf_str(), SrcStr) of
no_strmatch -> no_strmatch ->
no_tokmatch; no_tokmatch;
{strmatch, TokenStr, Rest} -> {strmatch, TokenStr, Rest} ->
@@ -715,7 +743,7 @@ slurp_token_of_shape(string, Pos, SrcStr) ->
{tokmatch, Token, Rest} {tokmatch, Token, Rest}
end; end;
slurp_token_of_shape(bytes, Pos, SrcStr) -> slurp_token_of_shape(bytes, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_bytes(), SrcStr) of case gsc_strmatch:match(gsc_strmatch:smr_sf_bytes(), SrcStr) of
no_strmatch -> no_strmatch ->
no_tokmatch; no_tokmatch;
{strmatch, TokenStr, Rest} -> {strmatch, TokenStr, Rest} ->
+2 -2
View File
@@ -1,6 +1,6 @@
% @doc compatibility layer to test against so_scan % @doc compatibility layer to test against so_scan
% %
% converts gs_tokens data to so_scan tokens % converts gsc_tokens data to so_scan tokens
% %
% Ref: so_scan.erl % Ref: so_scan.erl
-module(gso_scan). -module(gso_scan).
@@ -104,7 +104,7 @@
% @end % @end
scan(SrcStr) -> scan(SrcStr) ->
case gs_tokens:tokens(SrcStr) of case gsc_tokens:tokens(SrcStr) of
{ok, SfLTokens} -> {ok, SfLTokens} ->
SoTokens = to_so_tokens(SfLTokens), SoTokens = to_so_tokens(SfLTokens),
{ok, SoTokens}; {ok, SoTokens};