wip restructuring
This commit is contained in:
@@ -0,0 +1,165 @@
|
||||
% gsc tokenizer tests
|
||||
-module(gsc_test_tokens).
|
||||
|
||||
-export([
|
||||
main/0, ct_dir/0
|
||||
%tokens_match/1
|
||||
]).
|
||||
-include("$gsc_include/gsc.hrl").
|
||||
-include_lib("eunit/include/eunit.hrl").
|
||||
|
||||
% Manual entry point: runs every test and test generator in this module
% through EUnit with verbose reporting, and returns what eunit:test/2
% returns.
main() ->
    Options = [verbose],
    eunit:test(?MODULE, Options).
|
||||
%eunit:test(?MODULE).
|
||||
|
||||
|
||||
% directory containing the tests for the tokenizer
|
||||
ct_dir() ->
    % the base directory comes from zx_daemon:get_home/0; the tokenizer
    % tests sit in its "ct" subdirectory
    Home = zx_daemon:get_home(),
    Home ++ "/ct".
|
||||
|
||||
% directory holding the contracts used by the tokenizer agreement tests:
% the "tokenizers_agree" subdirectory of ct_dir()
agreement_tests_dir() ->
    Base = ct_dir(),
    Base ++ "/tokenizers_agree".
|
||||
|
||||
|
||||
% the divergences claude found between gsc tokenizer and so tokenizer
|
||||
%
|
||||
% mostly stupid corner cases like a string crossing a line boundary
|
||||
% or unterminated block comment
|
||||
%
|
||||
% divergence files: "divergence" means so_scan disagrees with
|
||||
% gsc_so_scan in one of the following ways:
|
||||
%
|
||||
% - one succeeds when the other errors
|
||||
% - disagree on success case
|
||||
%
|
||||
% making errors agree on two programs that work differently is a
|
||||
% fool's errand
|
||||
% Lists the contracts found in agreement_tests_dir().
%
% Returns a sorted list of {FileName, FilePath} pairs. FileName is the
% bare directory entry ("foo.aes"); FilePath is that entry prefixed with
% the directory ("/path/to/foo.aes"). Tests label themselves with the
% short FileName and read from FilePath, because printing the full path
% in every test title was a waste.
%
% Only entries whose extension is ".aes" are kept. Earlier versions
% matched on a "div0N" prefix (a hack to run one broken test at a time)
% and then on a leading ".", but filtering on the extension is the
% declarative way to keep vim swap files out: they are not unicode and
% do not lex.
%
% Crashes with badmatch if the directory cannot be listed.
div_files() ->
    Dir = agreement_tests_dir(),
    % file:list_dir/1 is the equivalent of ls: bare names, no /path/to/
    {ok, Entries} = file:list_dir(Dir),
    Contracts =
        [{Entry, Dir ++ "/" ++ Entry}
         || Entry <- Entries, filename:extension(Entry) =:= ".aes"],
    lists:sort(Contracts).
|
||||
|
||||
|
||||
%div_file_names() -> [N || {N, _} <- div_files()].
|
||||
%div_file_paths() -> [P || {_, P} <- div_files()].
|
||||
|
||||
% Test generator: one concat_property/2 test per contract, grouped under
% the title "file = sum(tokens)".
tokstr_concat_test_() ->
    % a list of file lists, flattened, so further sources can be added
    % later without touching the rest of the function
    Candidates = lists:flatten([div_files()]),
    % these contracts (unterminated block comments and the like) do not
    % tokenize properly, so the property is not checked on them
    Skipped =
        ["div05_bcom_eof.aes",
         "div06_bcom_in_expr.aes",
         "div07_bcom_nested.aes",
         "div08_bcom_simple.aes"],
    IsChecked = fun({Name, _}) -> not lists:member(Name, Skipped) end,
    Checked = lists:filter(IsChecked, Candidates),
    Tests = [concat_property(Name, Path) || {Name, Path} <- Checked],
    {"file = sum(tokens)", Tests}.
|
||||
|
||||
% Builds one named EUnit test checking that the token strings of a
% contract, concatenated in order, equal the characters of the contract.
%
% FileName labels the test; FilePath is the file that gets read.
%
% The file is read inside the test fun rather than while the test is being
% constructed: an exception raised during construction aborts the whole
% generator, whereas one raised inside the fun fails only this test.
%
% If tokenization returns anything other than {ok, Tokens}, the property
% does not apply and the test passes.
concat_property(FileName, FilePath) ->
    {FileName ++ ": file = sum(tokens)",
     fun() ->
         FileChars = gsc:very_stable_file(FilePath),
         % NOTE(review): tokens_from_file/1 is handed the file's
         % characters, not FilePath -- confirm that is what it expects,
         % since an error result here makes the test pass silently
         case gsc:tokens_from_file(FileChars) of
             {ok, SfcTokens} ->
                 ConcatStr = concat_token_strs(SfcTokens, []),
                 ?assertEqual(FileChars, ConcatStr);
             _Error ->
                 ok
         end
     end}.
|
||||
|
||||
% Appends the str field of each #tk{} token, in order, to the chardata
% already in Acc and returns unicode:characters_to_nfc_list/1 of the
% result. Nesting [Chars, Str] avoids copying the accumulator on every
% step; the single conversion at the end does the flattening.
concat_token_strs(Tokens, Acc) ->
    Append = fun(#tk{str = Str}, Chars) -> [Chars, Str] end,
    unicode:characters_to_nfc_list(lists:foldl(Append, Acc, Tokens)).
|
||||
|
||||
% underscore marks this as a test *generator*
|
||||
div_test_() ->
    % one tokens_match/2 test per contract returned by div_files(),
    % grouped under a single title
    Tests = [tokens_match(Name, Path) || {Name, Path} <- div_files()],
    {"claude tokenizer divergences fixed", Tests}.
|
||||
|
||||
% Builds one named EUnit test checking that so_scan and the gsc tokenizer
% agree on the contract at FilePath. FileName labels the test.
%
% Outcomes:
%   - both return {ok, Tokens}: the token lists must be equal
%   - both return {error, _}: pass; the error terms are not compared
%   - one succeeds and the other fails: the test fails, and the failing
%     side's error reason is included in the raised term
%
% Both tokenizers run inside the test fun rather than while the test is
% being constructed: an exception raised during construction aborts the
% whole generator, whereas one raised inside the fun fails only this test.
tokens_match(FileName, FilePath) ->
    {FileName ++ ": tokenizers_agree",
     fun() ->
         SoTokens = so_tokens_from_file(FilePath),
         SfTokens = gsc:gso_tokens_from_file(FilePath),
         case {SoTokens, SfTokens} of
             {{ok, So}, {ok, Sf}} ->
                 ?assertEqual(So, Sf);
             {{error, _}, {error, _}} ->
                 ok;
             {{ok, _}, {error, Reason}} ->
                 error({"so_scan succeeded and gso_scan failed", Reason});
             {{error, Reason}, {ok, _}} ->
                 error({"so_scan failed and gso_scan succeeded", Reason})
         end
     end}.
|
||||
|
||||
% that's right, we have to enter via converting the
|
||||
% bytes in the file to a list... lol
|
||||
so_tokens_from_file(F) ->
    % the file is read as a binary and handed to so_scan:scan/1 as a list
    % of its byte values; crashes with badmatch if the file cannot be read
    {ok, Contents} = file:read_file(F),
    so_scan:scan(binary_to_list(Contents)).
|
||||
Reference in New Issue
Block a user