Files
gsc/test/gsc_test_tokens.erl
T
2026-06-05 13:36:01 -07:00

166 lines
5.5 KiB
Erlang

% gsc tokenizer tests
-module(gsc_test_tokens).
-export([
main/0, ct_dir/0
%tokens_match/1
]).
-include("$gsc_include/gsc.hrl").
-include_lib("eunit/include/eunit.hrl").
% Runs every EUnit test in this module, with verbose output.
% Drop `verbose` from the options for the terse report instead.
main() ->
    Options = [verbose],
    eunit:test(?MODULE, Options).
% Directory containing the tests for the tokenizer: "/ct" appended to
% whatever zx_daemon:get_home() returns.
ct_dir() ->
    Home = zx_daemon:get_home(),
    Home ++ "/ct".
% Subdirectory of ct_dir() holding the tokenizer-agreement contracts.
agreement_tests_dir() ->
    lists:append(ct_dir(), "/tokenizers_agree").
% Lists the divergence contracts: the ".aes" files found directly in
% agreement_tests_dir().
%
% These are the divergences claude found between the gsc tokenizer and
% the so tokenizer. "Divergence" means so_scan disagrees with
% gsc_so_scan in one of the following ways:
%
% - one succeeds when the other errors
% - both succeed but disagree on the result
%
% Mostly corner cases, like a string crossing a line boundary or an
% unterminated block comment. Making the *errors* agree between two
% programs that work differently is not attempted.
%
% Returns a sorted list of {FileName, FilePath}. FileName is the bare
% name (foo.aes), which is what tests print as their label; FilePath is
% the contracts directory, a "/", and that name (/path/to/foo.aes).
div_files() ->
    ContractsDir = agreement_tests_dir(),
    % the equivalent of ls: just filenames, no /path/to/ prefix
    {ok, Names} = file:list_dir(ContractsDir),
    % Selecting on the extension (rather than rejecting dotfiles) keeps
    % vim swap files out; they are not unicode and do not lex.
    Contracts =
        [{Name, ContractsDir ++ "/" ++ Name}
         || Name <- Names, filename:extension(Name) =:= ".aes"],
    lists:sort(Contracts).
%div_file_names() -> [N || {N, _} <- div_files()].
%div_file_paths() -> [P || {_, P} <- div_files()].
% Test generator: one concat_property/2 test per usable contract, all
% grouped under a single title.
tokstr_concat_test_() ->
    % Contracts left out because they do not tokenize properly
    % (block-comment cases, e.g. an unterminated block comment).
    Skipped = [
        "div05_bcom_eof.aes",
        "div06_bcom_in_expr.aes",
        "div07_bcom_nested.aes",
        "div08_bcom_simple.aes"
    ],
    % Future proofing: further file lists can be added next to
    % div_files() and the flatten merges them into one list.
    Candidates = lists:flatten([div_files()]),
    Usable =
        lists:filter(
            fun({Name, _Path}) -> not lists:member(Name, Skipped) end,
            Candidates
        ),
    Properties = [concat_property(Name, Path) || {Name, Path} <- Usable],
    {"file = sum(tokens)", Properties}.
% Builds the titled test for one contract: the token strings of the
% file, concatenated in order, must equal the file contents.
%
% FileName labels the test; FilePath is passed to
% gsc:very_stable_file/1 to obtain the contents.
%
% Returns {Title, Fun}, where Fun is the zero-arity test body.
%
% The contents are fetched inside the test fun, not while the generator
% is still building its list of tests. An exception raised during
% generation aborts the whole generator, whereas one raised inside the
% fun fails only this contract's test.
%
% When tokenizing returns anything other than {ok, Tokens} the test
% passes without checking anything: this property only covers
% contracts that tokenize.
concat_property(FileName, FilePath) ->
    {FileName ++ ": file = sum(tokens)",
     fun() ->
         FileChars = gsc:very_stable_file(FilePath),
         case gsc:tokens_from_file(FileChars) of
             {ok, SfcTokens} ->
                 ConcatStr = concat_token_strs(SfcTokens, []),
                 ?assertEqual(FileChars, ConcatStr);
             _Error ->
                 ok
         end
     end}.
% Appends the str field of every #tk{} token, in order, after Acc and
% returns the result of unicode:characters_to_nfc_list/1 on the whole.
% The intermediate value is nested chardata, so nothing is copied until
% that final conversion.
concat_token_strs(Tokens, Acc) ->
    Chardata =
        lists:foldl(
            fun(#tk{str = Str}, SoFar) -> [SoFar, Str] end,
            Acc,
            Tokens
        ),
    unicode:characters_to_nfc_list(Chardata).
% The trailing underscore marks this as a test *generator*: it returns
% a titled list with one tokens_match/2 test per divergence contract.
div_test_() ->
    Tests = [tokens_match(Name, Path) || {Name, Path} <- div_files()],
    {"claude tokenizer divergences fixed", Tests}.
% Builds the titled test for one contract: so_scan and the gsc
% tokenizer must agree on it.
%
% FileName labels the test; FilePath is handed to both tokenizers.
%
% Returns {Title, Fun}, where Fun is the zero-arity test body. Inside:
%
% - both return {ok, _}: the two results must be equal
% - both return {error, _}: counts as agreement, errors are not compared
% - one {ok, _} and one {error, _}: the test fails with a message
%   saying which side succeeded
%
% Both tokenizers are run inside the test fun, not while the generator
% is still building its list of tests. An exception raised during
% generation aborts the whole generator, whereas one raised inside the
% fun fails only this contract's test.
tokens_match(FileName, FilePath) ->
    {FileName ++ ": tokenizers_agree",
     fun() ->
         SoTokens = so_tokens_from_file(FilePath),
         SfTokens = gsc:gso_tokens_from_file(FilePath),
         case {SoTokens, SfTokens} of
             {{ok, So}, {ok, Sf}} ->
                 ?assertEqual(So, Sf);
             {{error, _}, {error, _}} ->
                 ok;
             {{ok, _}, {error, _}} ->
                 error("so_scan succeeded and gso_scan failed");
             {{error, _}, {ok, _}} ->
                 error("so_scan failed and gso_scan succeeded")
         end
     end}.
% Reads file F and returns what so_scan:scan/1 makes of it.
% The scanner is entered with a list, so the bytes of the file are
% converted to a list first (one element per byte).
so_tokens_from_file(F) ->
    {ok, Contents} = file:read_file(F),
    so_scan:scan(binary_to_list(Contents)).