% gsc/test/gsc_test_tokens.erl

% gsc tokenizer tests
-module(gsc_test_tokens).

-export([
    main/0, ct_dir/0
    %tokens_match/1
]).
-include("$gsc_include/gsc.hrl").
-include_lib("eunit/include/eunit.hrl").

% Runs every EUnit test found in this module.
%
% The `verbose` option is passed to eunit:test/2; the return value is
% whatever eunit:test/2 returns.
main() ->
    Options = [verbose],
    eunit:test(?MODULE, Options).


% Directory containing the tests for the tokenizer: the "ct"
% subdirectory of whatever zx_daemon:get_home/0 returns.
ct_dir() ->
    Home = zx_daemon:get_home(),
    Home ++ "/ct".

% Subdirectory of ct_dir/0 that div_files/0 scans for the tokenizer
% agreement contracts.
agreement_tests_dir() ->
    Base = ct_dir(),
    Base ++ "/tokenizers_agree".


% The divergence contracts: cases claude found where the gsc tokenizer
% and the so tokenizer disagreed.
%
% These are mostly corner cases, like a string crossing a line boundary
% or an unterminated block comment.
%
% "Divergence" means so_scan disagrees with gsc_so_scan in one of the
% following ways:
%
%   - one succeeds when the other errors
%   - both succeed but produce different results
%
% Making the *errors* of two differently-built programs agree is not
% attempted.
%
% Returns a sorted list of {FileName, FilePath} pairs, one per ".aes"
% entry in agreement_tests_dir/0:
%
%   - FileName is the bare name (foo.aes), used to label each test
%     without printing the whole path
%   - FilePath is the directory joined to the name (/path/to/foo.aes)
%
% Crashes with a badmatch if the directory cannot be listed.
div_files() ->
    Dir = agreement_tests_dir(),
    % file:list_dir/1 is the equivalent of ls: bare filenames only, no
    % /path/to/ prefix
    {ok, Entries} = file:list_dir(Dir),
    % Selecting on the ".aes" extension (rather than rejecting
    % dot-files) keeps everything that is not a contract out of the
    % list. In particular, vim swap files are not unicode and do not
    % lex properly.
    Contracts =
        [{Entry, Dir ++ "/" ++ Entry}
         || Entry <- Entries,
            filename:extension(Entry) =:= ".aes"],
    lists:sort(Contracts).


%div_file_names() -> [N || {N, _} <- div_files()].
%div_file_paths() -> [P || {_, P} <- div_files()].

% Test generator: one concat_property/2 test per contract file, grouped
% under the title "file = sum(tokens)".
tokstr_concat_test_() ->
    % a flattened list of sources, so that more file sets can be added
    % next to div_files() later
    Candidates = lists:flatten([div_files()]),
    % contracts with things like unterminated block comments, which do
    % not tokenize properly, are left out of this property
    Skipped =
        ["div05_bcom_eof.aes",
         "div06_bcom_in_expr.aes",
         "div07_bcom_nested.aes",
         "div08_bcom_simple.aes"],
    IsKept = fun({Name, _Path}) -> not lists:member(Name, Skipped) end,
    Kept = lists:filter(IsKept, Candidates),
    Tests = [concat_property(Name, Path) || {Name, Path} <- Kept],
    {"file = sum(tokens)", Tests}.

% Builds one titled EUnit test for a single contract file.
%
%   FileName - bare file name, used only in the test title
%   FilePath - path handed to gsc:very_stable_file/1
%
% The test asserts that concatenating the `str` of every token returned
% by gsc:tokens_from_file/1 gives back exactly what
% gsc:very_stable_file/1 returned for the file.
%
% All the work happens inside the test fun, not while the generator is
% building the test list. If reading one file crashes, only that
% file's test fails; the rest of the generated tests still run.
concat_property(FileName, FilePath) ->
    {FileName ++ ": file = sum(tokens)",
     fun() ->
        FileChars = gsc:very_stable_file(FilePath),
        % NOTE(review): tokens_from_file/1 is given the value returned
        % by very_stable_file/1, not the path -- confirm that is the
        % input it expects.
        case gsc:tokens_from_file(FileChars) of
            {ok, SfcTokens} ->
                ConcatStr = concat_token_strs(SfcTokens, []),
                ?assertEqual(FileChars, ConcatStr);
            _Error ->
                % NOTE(review): any result other than {ok, _} passes
                % without checking anything; kept as-is because files
                % that fail to tokenize are outside this property.
                ok
        end
     end}.

% Appends the `str` field of each #tk{} token, in order, after the
% chardata already in Acc0, then returns the result of
% unicode:characters_to_nfc_list/1 on the whole thing.
%
% The strings are collected as nested chardata ([Acc, Str]) rather than
% concatenated one by one; the single conversion at the end does the
% flattening.
concat_token_strs(Tokens, Acc0) ->
    Collect = fun(#tk{str = Str}, Acc) -> [Acc, Str] end,
    Nested = lists:foldl(Collect, Acc0, Tokens),
    unicode:characters_to_nfc_list(Nested).

% The trailing underscore marks this as a test *generator*: it returns
% a titled set holding one tokens_match/2 test per divergence contract
% listed by div_files/0.
div_test_() ->
    Contracts = div_files(),
    Tests = [tokens_match(Name, Path) || {Name, Path} <- Contracts],
    {"claude tokenizer divergences fixed", Tests}.

% Builds one titled EUnit test checking that so_scan and the gso
% scanner agree on a single contract file.
%
%   FileName - bare file name, used only in the test title
%   FilePath - path given to both scanners
%
% Both scanners run inside the test fun, not while the generator is
% building the test list. If either scanner crashes on one file, only
% that file's test fails; the rest of the generated tests still run.
tokens_match(FileName, FilePath) ->
    {FileName ++ ": tokenizers_agree",
     fun() ->
          SoResult = so_tokens_from_file(FilePath),
          GsoResult = gsc:gso_tokens_from_file(FilePath),
          assert_scanners_agree(SoResult, GsoResult)
     end}.

% Passes when both scanners succeed with equal tokens, or when both
% fail. Two failures count as agreement whatever their reasons are.
% When only one scanner fails, the raised error carries that scanner's
% reason so the test report shows why.
assert_scanners_agree({ok, So}, {ok, Gso}) ->
    ?assertEqual(So, Gso);
assert_scanners_agree({error, _}, {error, _}) ->
    ok;
assert_scanners_agree({ok, _}, {error, GsoReason}) ->
    error({so_scan_succeeded_gso_scan_failed, GsoReason});
assert_scanners_agree({error, SoReason}, {ok, _}) ->
    error({so_scan_failed_gso_scan_succeeded, SoReason}).

% Runs so_scan:scan/1 over the contents of the file at Path and returns
% its result.
%
% The scanner is entered with a list, so the bytes of the file are
% converted with binary_to_list/1 (one list element per byte).
%
% Crashes with a badmatch if the file cannot be read.
so_tokens_from_file(Path) ->
    {ok, Contents} = file:read_file(Path),
    so_scan:scan(binary_to_list(Contents)).