wip restructuring

2026-06-05 13:36:01 -07:00
parent f04b7311f5
commit f79403b97f
55 changed files with 29 additions and 85 deletions
@@ -0,0 +1,165 @@
+% gsc tokenizer tests
+-module(gsc_test_tokens).
+
+-export([
+    main/0, ct_dir/0
+    %tokens_match/1
+]).
+-include("$gsc_include/gsc.hrl").
+-include_lib("eunit/include/eunit.hrl").
+
+% Manual entry point: runs every EUnit test and test generator in this
+% module with verbose output and returns the result of eunit:test/2.
+main() ->
+    Options = [verbose],
+    eunit:test(?MODULE, Options).
+
+
+% Directory containing the test contracts for the tokenizer: the "ct"
+% directory under the home path reported by zx_daemon:get_home/0.
+ct_dir() ->
+    Home = zx_daemon:get_home(),
+    Home ++ "/ct".
+
+% Directory of the contracts used by the tokenizer agreement tests:
+% the "tokenizers_agree" subdirectory of ct_dir/0.
+agreement_tests_dir() ->
+    lists:append(ct_dir(), "/tokenizers_agree").
+
+
+% Lists the divergence contracts as a sorted list of
+% {FileName, FilePath} pairs.
+%
+% These are the divergences claude found between the gsc tokenizer and
+% the so tokenizer; mostly corner cases like a string crossing a line
+% boundary or an unterminated block comment.
+%
+% "divergence" means so_scan disagrees with gsc_so_scan in one of the
+% following ways:
+%
+%   - one succeeds when the other errors
+%   - both succeed but disagree on the result
+%
+% Making the *errors* agree between two programs that work differently
+% is not a goal.
+%
+% Each entry carries both the bare FileName (foo.aes), which keeps test
+% titles short, and the FilePath (/path/to/foo.aes).
+div_files() ->
+    Dir = agreement_tests_dir(),
+    % file:list_dir/1 is the equivalent of ls: bare filenames only, no
+    % /path/to/ prefix
+    {ok, Entries} = file:list_dir(Dir),
+    % Keep only the *.aes entries. This filters out vim swap files,
+    % which do not lex properly because they are not unicode.
+    Contracts =
+        [{Name, Dir ++ "/" ++ Name}
+         || Name <- Entries, filename:extension(Name) =:= ".aes"],
+    lists:sort(Contracts).
+
+
+%div_file_names() -> [N || {N, _} <- div_files()].
+%div_file_paths() -> [P || {_, P} <- div_files()].
+
+% Test generator: one "file = sum(tokens)" test per contract, built by
+% concat_property/2.
+tokstr_concat_test_() ->
+    % Contracts (unterminated block comments and the like) that don't
+    % tokenize properly, so the property is not checked for them.
+    Skipped = [
+        "div05_bcom_eof.aes",
+        "div06_bcom_in_expr.aes",
+        "div07_bcom_nested.aes",
+        "div08_bcom_simple.aes"
+    ],
+    % future proofing: further file sources can be added to this list
+    Candidates = lists:flatten([div_files()]),
+    Tests =
+        [concat_property(Name, Path)
+         || {Name, Path} <- Candidates, not lists:member(Name, Skipped)],
+    {"file = sum(tokens)", Tests}.
+
+% Builds one EUnit test for the contract FileName located at FilePath:
+% the characters of the file must equal the concatenation of the
+% strings of its tokens.
+%
+% Returns a {Title, Fun} pair. The file is loaded inside Fun rather
+% than while the generator is building its test list, so a crash while
+% loading one contract fails only that test instead of aborting every
+% test produced by the generator.
+concat_property(FileName, FilePath) ->
+    {FileName ++ ": file = sum(tokens)",
+     fun() ->
+        FileChars = gsc:very_stable_file(FilePath),
+        case gsc:tokens_from_file(FileChars) of
+            {ok, SfcTokens} ->
+                ConcatStr = concat_token_strs(SfcTokens, []),
+                ?assertEqual(FileChars, ConcatStr);
+            _Error ->
+                % NOTE(review): any non-{ok, _} result counts as a pass,
+                % so a contract that fails to tokenize is silently
+                % skipped -- confirm this is intended.
+                ok
+        end
+     end}.
+
+% Appends the str field of every #tk{} token, in order, after the
+% chardata already in Acc, and returns the result of passing the
+% combined chardata through unicode:characters_to_nfc_list/1.
+concat_token_strs(Tokens, Acc) ->
+    Append = fun(#tk{str = Str}, SoFar) -> [SoFar, Str] end,
+    Chars = lists:foldl(Append, Acc, Tokens),
+    unicode:characters_to_nfc_list(Chars).
+
+% Test generator (the trailing underscore is what marks it as one):
+% one tokens_match/2 test per divergence contract.
+div_test_() ->
+    Tests = [tokens_match(Name, Path) || {Name, Path} <- div_files()],
+    {"claude tokenizer divergences fixed", Tests}.
+
+% Builds one EUnit test for the contract FileName located at FilePath,
+% checking that the two tokenizers agree on it:
+%
+%   - both succeed: the token lists must be equal
+%   - both fail:    counts as agreement (the errors are not compared)
+%   - only one succeeds: the test fails
+%
+% Returns a {Title, Fun} pair. Both tokenizers run inside Fun rather
+% than while the generator is building its test list, so a crash in
+% either of them (e.g. an unreadable file) fails only this test instead
+% of aborting every test produced by the generator.
+tokens_match(FileName, FilePath) ->
+    {FileName ++ ": tokenizers_agree",
+     fun() ->
+          SoTokens = so_tokens_from_file(FilePath),
+          SfTokens = gsc:gso_tokens_from_file(FilePath),
+          case {SoTokens, SfTokens} of
+              {{ok, So},    {ok, Sf}}  -> ?assertEqual(So, Sf);
+              {{error, _}, {error, _}} -> ok;
+              {{ok, _},    {error, _}} -> error("so_scan succeeded and gso_scan failed");
+              {{error, _}, {ok, _}}    -> error("so_scan failed and gso_scan succeded")
+          end
+     end}.
+
+% Runs so_scan over the file F and returns the result of so_scan:scan/1.
+% so_scan is entered with the file's raw bytes converted to a list
+% (one integer per byte), not with decoded characters.
+so_tokens_from_file(F) ->
+    {ok, Contents} = file:read_file(F),
+    so_scan:scan(binary_to_list(Contents)).