% gsc tokenizer tests
|
|
-module(gsc_test_tokens).
|
|
|
|
-export([
|
|
main/0, ct_dir/0
|
|
%tokens_match/1
|
|
]).
|
|
-include("$gsc_include/gsc.hrl").
|
|
-include_lib("eunit/include/eunit.hrl").
|
|
|
|
main() ->
    % Entry point: hand this module to eunit so that every test and test
    % generator it exports is run, with the verbose option turned on.
    Options = [verbose],
    eunit:test(?MODULE, Options).
|
|
%eunit:test(?MODULE).
|
|
|
|
|
|
% directory containing the tests for the tokenizer
|
|
ct_dir() ->
    % the "ct" directory directly under whatever zx_daemon:get_home/0 returns
    Home = zx_daemon:get_home(),
    Home ++ "/ct".
|
|
|
|
agreement_tests_dir() ->
    % the "tokenizers_agree" directory directly under ct_dir()
    Parent = ct_dir(),
    Parent ++ "/tokenizers_agree".
|
|
|
|
|
|
% the divergences claude found between gsc tokenizer and so tokenizer
|
|
%
|
|
% mostly stupid corner cases like a string crossing a line boundary
|
|
% or unterminated block comment
|
|
%
|
|
% divergence files: "divergence" means so_scan disagrees with
|
|
% gsc_so_scan in one of the following ways:
|
|
%
|
|
% - one succeeds when the other errors
|
|
% - disagree on success case
|
|
%
|
|
% making errors agree on two programs that work differently is a
|
|
% fool's errand
|
|
div_files() ->
    % Returns a sorted list of {FileName, FilePath} pairs, one for each
    % ".aes" file found directly inside agreement_tests_dir().
    %
    % FileName is the bare name (foo.aes) and is what test titles use,
    % since printing the full path is noise. FilePath is the same name
    % with the directory in front (/path/to/foo.aes).
    Dir = agreement_tests_dir(),
    % file:list_dir/1 is the equivalent of ls: it yields bare file names
    % with no /path/to/ prefix. A failure to list the directory is a
    % badmatch here.
    {ok, Names} = file:list_dir(Dir),
    % History: this used to be a prefix hack that let through only some of
    % the div0N files, because a single failing test crashed the whole
    % suite with no indication of what went wrong. With eunit test
    % generators only the failing test reports, so the hack is gone and
    % every contract is returned.
    %
    % The filter selects on the ".aes" extension instead of rejecting
    % names that start with a dot. That is what keeps vim swap files out:
    % they are not unicode and do not lex.
    lists:sort([{Name, Dir ++ "/" ++ Name}
                || Name <- Names, filename:extension(Name) =:= ".aes"]).
|
|
|
|
|
|
%div_file_names() -> [N || {N, _} <- div_files()].
|
|
%div_file_paths() -> [P || {_, P} <- div_files()].
|
|
|
|
tokstr_concat_test_() ->
    % Test generator: one concat_property/2 test for every contract file
    % that is expected to tokenize.
    %
    % Sources is a list of file lists so that further sets of contracts can
    % be added later; flattening it gives one list of {FileName, FilePath}.
    Sources = [div_files()],
    Candidates = lists:flatten(Sources),
    % contracts left out because they do not tokenize properly
    % (unterminated block comments and similar)
    Skipped = [
        "div05_bcom_eof.aes",
        "div06_bcom_in_expr.aes",
        "div07_bcom_nested.aes",
        "div08_bcom_simple.aes"
    ],
    Usable =
        lists:filter(
            fun({Name, _}) -> not lists:member(Name, Skipped) end,
            Candidates
        ),
    Tests =
        lists:map(
            fun({Name, Path}) -> concat_property(Name, Path) end,
            Usable
        ),
    {"file = sum(tokens)", Tests}.
|
|
|
|
concat_property(FileName, FilePath) ->
    % Builds one titled eunit test for a single contract file. The test
    % checks that concatenating the str field of every token produced for
    % the file gives back exactly the characters of the file.
    %
    % FileName is used only for the test title; FilePath is what gets read.
    %
    % The file is loaded inside the test fun rather than while the
    % generator is assembling the test list. If loading raises, that is
    % reported as a failure of this one named test instead of taking the
    % whole generator down with it.
    {FileName ++ ": file = sum(tokens)",
     fun() ->
         FileChars = gsc:very_stable_file(FilePath),
         % NOTE(review): tokens_from_file is handed the file's characters
         % here, not its path -- confirm that is what gsc expects.
         case gsc:tokens_from_file(FileChars) of
             {ok, SfcTokens} ->
                 ConcatStr = concat_token_strs(SfcTokens, []),
                 ?assertEqual(FileChars, ConcatStr);
             _Error ->
                 % Anything other than {ok, Tokens} leaves nothing to
                 % concatenate, so the property passes vacuously.
                 ok
         end
     end}.
|
|
|
|
concat_token_strs(Tokens, Acc) ->
    % Appends the str field of each #tk{} in Tokens, in order, after Acc,
    % then converts the resulting nested chardata to a flat NFC-normalized
    % list with unicode:characters_to_nfc_list/1.
    %
    % An element of Tokens that is not a #tk{} record is a function_clause
    % error.
    Nested =
        lists:foldl(
            fun(#tk{str = Str}, SoFar) -> [SoFar, Str] end,
            Acc,
            Tokens
        ),
    unicode:characters_to_nfc_list(Nested).
|
|
|
|
% underscore marks this as a test *generator*
|
|
div_test_() ->
    % Test generator (hence the trailing underscore): one tokens_match/2
    % test per contract returned by div_files/0, grouped under one title.
    Contracts = div_files(),
    Tests =
        lists:map(
            fun({Name, Path}) -> tokens_match(Name, Path) end,
            Contracts
        ),
    {"claude tokenizer divergences fixed", Tests}.
|
|
|
|
tokens_match(FileName, FilePath) ->
    % Builds one titled eunit test for a single contract file. The test
    % runs both tokenizers on the file and requires them to agree:
    %
    %   - both succeed          -> the two token lists must be equal
    %   - both return an error  -> pass (the error terms are not compared)
    %   - only one succeeds     -> fail
    %
    % FileName is used only for the test title; FilePath is what gets read.
    %
    % Both tokenizers run inside the test fun rather than while the
    % generator is assembling the test list. so_tokens_from_file/1 asserts
    % that the file can be read, so running it eagerly meant one unreadable
    % file was a badmatch that aborted the generator; now it fails only the
    % test for that file.
    {FileName ++ ": tokenizers_agree",
     fun() ->
         SoTokens = so_tokens_from_file(FilePath),
         SfTokens = gsc:gso_tokens_from_file(FilePath),
         case {SoTokens, SfTokens} of
             {{ok, So}, {ok, Sf}} -> ?assertEqual(So, Sf);
             {{error, _}, {error, _}} -> ok;
             {{ok, _}, {error, _}} -> error("so_scan succeeded and gso_scan failed");
             {{error, _}, {ok, _}} -> error("so_scan failed and gso_scan succeded")
         end
     end}.
|
|
|
|
% that's right, we have to enter via converting the
|
|
% bytes in the file to a list... lol
|
|
so_tokens_from_file(F) ->
    % Reads file F and returns whatever so_scan:scan/1 produces for it.
    %
    % so_scan is fed a list, so the file's binary contents are turned into
    % a list of byte values first; no unicode decoding happens here.
    % A file that cannot be read is a badmatch.
    {ok, Contents} = file:read_file(F),
    so_scan:scan(binary_to_list(Contents)).
|