Map parsing

List parsing
Slowly chipping away at cases...
2026-01-16 05:46:27 +00:00 · 2026-01-15 09:38:04 +00:00 · 2026-01-15 01:52:30 +00:00 · 2026-01-15 01:50:50 +00:00 · 2026-01-13 01:19:29 +00:00 · 2026-01-09 04:39:58 +00:00
3 changed files with 1460 additions and 1085 deletions
--- a/src/hz.erl
+++ b/src/hz.erl
--- a/src/hz_aaci.erl
+++ b/src/hz_aaci.erl
--- a/src/hz_sophia.erl
+++ b/src/hz_sophia.erl
@@ -0,0 +1,267 @@
 -module(hz_sophia).
 -vsn("0.8.2").
 -author("Jarvis Carroll <spiveehere@gmail.com>").
 -copyright("Jarvis Carroll <spiveehere@gmail.com>").
 -license("GPL-3.0-or-later").
 -include_lib("eunit/include/eunit.hrl").
 % Parse String as one complete literal of the expected Type.
 %
 % Tokenizing starts at position {tk, 1, 1}. On success the parsed value and
 % the remaining input are handed to parse_literal2/3, which checks that
 % nothing but end-of-input follows. Any {error, Reason} from the expression
 % parser is returned unchanged.
 parse_literal(Type, String) ->
    Origin = {tk, 1, 1},
    case parse_expression(Type, Origin, String) of
        {error, _} = Failure ->
            Failure;
        {ok, {Value, Pos, Remaining}} ->
            parse_literal2(Value, Pos, Remaining)
    end.
 % Second half of parse_literal/2: an expression has already been parsed
 % into Result, so the only acceptable next token is eof.
 %
 % Returns {ok, Result} at end of input, {error, {unexpected_token, ...}}
 % carrying the text and position of any other token, or the tokenizer's
 % own {error, Reason}.
 parse_literal2(Result, Tk, String) ->
    case next_token(Tk, String) of
        {error, _} = Failure ->
            Failure;
        {ok, {{eof, _, _, _, _}, _, _}} ->
            {ok, Result};
        {ok, {{_, Text, Line, From, To}, _, _}} ->
            % Trailing input after a complete expression.
            {error, {unexpected_token, Text, Line, From, To}}
    end.
 %%% Tokenizer
 % Produce the next token from the input.
 %
 % Takes the tokenizer position {tk, Row, Col} and the remaining input as a
 % character list. Returns {ok, {Token, NewTk, Rest}} where Token is
 % {Kind, Text, Row, StartCol, EndCol} and Kind is eof, integer, alphanum or
 % character.
 %
 % Whitespace is skipped. Spaces and tabs advance the column; a line break
 % ("\n" or "\r\n") advances the row and resets the column to 1. Previously
 % the row was advanced for every consumed character and the column never
 % moved, so every reported position was wrong.
 next_token({tk, Row, Col}, []) ->
    {ok, {{eof, "", Row, Col, Col}, {tk, Row, Col}, []}};
 next_token({tk, Row, Col}, " " ++ Rest) ->
    next_token({tk, Row, Col + 1}, Rest);
 next_token({tk, Row, Col}, "\t" ++ Rest) ->
    next_token({tk, Row, Col + 1}, Rest);
 next_token({tk, Row, _}, "\r\n" ++ Rest) ->
    next_token({tk, Row + 1, 1}, Rest);
 next_token({tk, Row, _}, "\n" ++ Rest) ->
    next_token({tk, Row + 1, 1}, Rest);
 next_token(Tk, [N | _] = String) when N >= $0, N =< $9 ->
    num_token(Tk, Tk, String, []);
 next_token(Tk, [C | _] = String)
        when C >= $A, C =< $Z; C >= $a, C =< $z; C =:= $_ ->
    alphanum_token(Tk, Tk, String, []);
 next_token({tk, Row, Col}, [Char | Rest]) ->
    % Any other character is a one-character token occupying column Col.
    Token = {character, [Char], Row, Col, Col},
    {ok, {Token, {tk, Row, Col + 1}, Rest}}.
 % Consume a run of decimal digits and build an integer token.
 %
 % The first argument is the position where the token began, the second is
 % the current position, the third is the remaining input and Acc holds the
 % digits read so far in reverse order.
 %
 % Returns {ok, {{integer, Digits, Row, StartCol, EndCol}, NewTk, Rest}}.
 % EndCol is the column of the last digit (inclusive), matching the
 % convention used for single-character tokens; NewTk points at the column
 % just past the token.
 num_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $0, N =< $9 ->
    % Each digit occupies one column on the same row.
    num_token(Start, {tk, Row, Col + 1}, Rest, [N | Acc]);
 num_token({tk, _, Start}, {tk, Row, Next}, String, Acc) ->
    NumString = lists:reverse(Acc),
    Token = {integer, NumString, Row, Start, Next - 1},
    {ok, {Token, {tk, Row, Next}, String}}.
 % Consume a run of letters, digits and underscores and build an alphanum
 % token.
 %
 % The first argument is the position where the token began, the second is
 % the current position, the third is the remaining input and Acc holds the
 % characters read so far in reverse order.
 %
 % Returns {ok, {{alphanum, Text, Row, StartCol, EndCol}, NewTk, Rest}}.
 % EndCol is the column of the last character (inclusive); NewTk points at
 % the column just past the token. Previously the position was never
 % advanced, so EndCol equalled StartCol and the tokens that followed were
 % reported at the wrong column.
 alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc)
        when C >= $A, C =< $Z;
             C >= $a, C =< $z;
             C >= $0, C =< $9;
             C =:= $_ ->
    alphanum_token(Start, {tk, Row, Col + 1}, Rest, [C | Acc]);
 alphanum_token({tk, _, Start}, {tk, Row, Next}, String, Acc) ->
    AlphaString = lists:reverse(Acc),
    Token = {alphanum, AlphaString, Row, Start, Next - 1},
    {ok, {Token, {tk, Row, Next}, String}}.
 %%% Sophia Literal Parser
 %%% This parser is a simple recursive descent parser, written explicitly in
 %%% Erlang.
 %%%
 %%% There are no infix operators in the subset we want to parse, so recursive
 %%% descent is fine with no special tricks, no shunting yard algorithm, no
 %%% parser generators, etc.
 %%%
 %%% If we were writing this in C then we might want to work iteratively with an
 %%% array of finite state machines, i.e. with a pushdown automaton, instead of
 %%% using recursion. This is a tried and true method of making fast parsers.
 %%% Recall, however, that the BEAM *is* a stack machine, written in C, so
 %%% rather than writing confusing iterative code in Erlang, to simulate a
 %%% pushdown automaton inside another simulated stack machine... we should just
 %%% write the recursive code, thus programming the BEAM to implement the
 %%% pushdown automaton that we want.
 % Read one token and let parse_expression2/4 decide what kind of expression
 % it starts. The tokenizer result is matched assertively, so anything other
 % than {ok, ...} from next_token/2 crashes here.
 parse_expression(Type, Tk, String) ->
    {ok, {Head, AfterHead, Remaining}} = next_token(Tk, String),
    parse_expression2(Type, AfterHead, Remaining, Head).
 % Parse an expression whose first token has already been read.
 %
 % Type is the expected type as a 3-tuple; only its third element is
 % inspected here. Tk and String are the position and input that follow
 % the given token.
 %
 %   integer token - accepted when the expected type is integer or
 %                   unknown_type, otherwise a wrong_type error.
 %   "["           - delegated to parse_list/5.
 %   "{"           - delegated to parse_record_or_map/5.
 %   anything else - unexpected_token error.
 parse_expression2(Type, Tk, String, {integer, Digits, Line, From, To}) ->
    Value = list_to_integer(Digits),
    case Type of
        {_, _, Expected} when Expected =:= integer; Expected =:= unknown_type ->
            {ok, {Value, Tk, String}};
        {O, N, _} ->
            {error, {wrong_type, O, N, integer, Line, From, To}}
    end;
 parse_expression2(Type, Tk, String, {character, "[", Line, From, _}) ->
    parse_list(Type, Tk, String, Line, From);
 parse_expression2(Type, Tk, String, {character, "{", Line, From, _}) ->
    parse_record_or_map(Type, Tk, String, Line, From);
 parse_expression2(_, _, _, {_, Text, Line, From, To}) ->
    {error, {unexpected_token, Text, Line, From, To}}.
 % The type triple used when no type information is available. Parser
 % clauses that match {_, _, unknown_type} accept whatever literal they find.
 unknown_type() ->
    Placeholder = unknown_type,
    {Placeholder, already_normalized, Placeholder}.
 % Consume a fixed sequence of tokens.
 %
 % The first argument is a list of expected token texts, e.g. ["]", "="].
 % Each one must match the text of the next token in the input, whatever
 % its kind.
 %
 % Returns {ok, {Tk, String}} with the position and input after the last
 % expected token, or {error, {unexpected_token, Text, Row, Start, End}} for
 % the first token that does not match.
 expect_tokens([], Tk, String) ->
    {ok, {Tk, String}};
 expect_tokens([Str | Rest], Tk, String) ->
    case next_token(Tk, String) of
        {ok, {{_, Str, _, _, _}, NewTk, NewString}} ->
            expect_tokens(Rest, NewTk, NewString);
        {ok, {{_, Actual, Row, Start, End}, _, _}} ->
            % next_token/2 returns a 3-tuple {Token, Tk, String}; the pattern
            % must have that shape or a mismatch crashes with case_clause.
            {error, {unexpected_token, Actual, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.
 %%% List Parsing
 % Entry point for list literals; called after the opening "[" has been
 % consumed. Row and Start are the position of that bracket.
 %
 % A {list, [ElemType]} type parses every element as ElemType, and
 % unknown_type parses every element as unknown_type(). Any other expected
 % type yields a wrong_type error located at the bracket.
 parse_list({_, _, {list, [ElemType]}}, Tk, String, Row, Start) ->
    parse_list_loop(ElemType, Tk, String, Row, Start, []);
 parse_list({_, _, unknown_type}, Tk, String, Row, Start) ->
    ElemType = unknown_type(),
    parse_list_loop(ElemType, Tk, String, Row, Start, []);
 parse_list({O, N, _}, _Tk, _String, Row, Start) ->
    {error, {wrong_type, O, N, list, Row, Start, Start}}.
 % Start of a list element position: either the list closes here with "]",
 % or the token just read begins the next element.
 %
 % Acc holds the elements parsed so far, most recent first. On "]" it is
 % reversed and returned as {ok, {Elements, Tk, String}}.
 parse_list_loop(Inner, Tk, String, Row, Start, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, "]", _, _, _}, AfterClose, Remaining}} ->
            Elements = lists:reverse(Acc),
            {ok, {Elements, AfterClose, Remaining}};
        {ok, {First, AfterFirst, Remaining}} ->
            % The token has already been consumed, so pass it along.
            parse_list_loop2(Inner, AfterFirst, Remaining, Row, Start, Acc, First)
    end.
 % Parse one list element whose first token (Token) has already been read.
 %
 % On success the value is pushed onto Acc and parsing continues with the
 % separator check in parse_list_loop3/6. On failure the reason is passed
 % through wrap_error/2 together with {list_element, Index}, where Index is
 % the number of elements parsed before this one.
 parse_list_loop2(Inner, Tk, String, Row, Start, Acc, Token) ->
    case parse_expression2(Inner, Tk, String, Token) of
        {error, Reason} ->
            Index = length(Acc),
            {error, wrap_error(Reason, {list_element, Index})};
        {ok, {Element, AfterElement, Remaining}} ->
            parse_list_loop3(Inner, AfterElement, Remaining, Row, Start, [Element | Acc])
    end.
 % After a list element: expect "]" (the list is complete) or "," (go back
 % to parse_list_loop/6 for the next element).
 %
 % Acc holds the elements parsed so far, most recent first.
 %
 % Returns {ok, {Elements, Tk, String}} on "]". Any other token, including
 % end of input, returns {error, {unexpected_token, Text, Row, Start, End}}
 % with that token's position, instead of crashing with case_clause.
 %
 % NOTE(review): because "," hands control back to parse_list_loop/6, which
 % accepts "]", a trailing comma such as "[1, 2, ]" is accepted - confirm
 % whether that is intended.
 parse_list_loop3(Inner, Tk, String, Row, Start, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, "]", _, _, _}, NewTk, NewString}} ->
            {ok, {lists:reverse(Acc), NewTk, NewString}};
        {ok, {{character, ",", _, _, _}, NewTk, NewString}} ->
            parse_list_loop(Inner, NewTk, NewString, Row, Start, Acc);
        {ok, {{_, S, TokRow, TokStart, TokEnd}, _, _}} ->
            {error, {unexpected_token, S, TokRow, TokStart, TokEnd}};
        {error, Reason} ->
            {error, Reason}
    end.
 %%% Record parsing
 % Entry point for "{"-delimited literals; called after the opening brace
 % has been consumed. The last two arguments are the row and column of that
 % brace.
 %
 %   {map, [K, V]}     - parse a map with key type K and value type V.
 %   {record, Fields}  - delegate to parse_record/3.
 %   unknown_type      - peek one token: "}" is an empty map, "[" starts a
 %                       map entry parsed with unknown key/value types, an
 %                       alphanum token gives unresolved_record, and
 %                       anything else gives unexpected_token.
 %   any other type    - wrong_type error located at the brace.
 parse_record_or_map({_, _, {map, [KType, VType]}}, Tk, String, _, _) ->
    Empty = #{},
    parse_map(KType, VType, Tk, String, Empty);
 parse_record_or_map({_, _, {record, FieldSpecs}}, Tk, String, _, _) ->
    parse_record(FieldSpecs, Tk, String);
 parse_record_or_map({_, _, unknown_type}, Tk, String, _, _) ->
    case next_token(Tk, String) of
        {ok, {{character, "[", _, _, _}, AfterOpen, Remaining}} ->
            Any = unknown_type(),
            parse_map2(Any, Any, AfterOpen, Remaining, #{});
        {ok, {{character, "}", _, _, _}, AfterClose, Remaining}} ->
            {ok, {#{}, AfterClose, Remaining}};
        {ok, {{alphanum, _, Line, From, To}, _, _}} ->
            {error, {unresolved_record, Line, From, To}};
        {ok, {{_, Text, Line, From, To}, _, _}} ->
            {error, {unexpected_token, Text, Line, From, To}}
    end;
 parse_record_or_map({O, N, _}, _, _, Row, Start) ->
    {error, {wrong_type, O, N, map, Row, Start, Start}}.
 % Record literals are not supported yet: always returns
 % {error, not_yet_implemented}. The arguments (record field types,
 % tokenizer position, remaining input) are underscore-prefixed so the stub
 % compiles without unused-variable warnings.
 parse_record(_Fields, _Tk, _String) ->
    {error, not_yet_implemented}.
 %%% Map Parsing
 % Start of a map entry position: expect "[" (a key follows) or "}" (the
 % map is complete).
 %
 % Acc is the map built so far. Returns {ok, {Map, Tk, String}} on "}",
 % continues with parse_map2/5 on "[", and returns
 % {error, {unexpected_token, Text, Row, Start, End}} for any other token.
 parse_map(KeyType, ValueType, Tk, String, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, "[", _, _, _}, NewTk, NewString}} ->
            parse_map2(KeyType, ValueType, NewTk, NewString, Acc);
        {ok, {{character, "}", _, _, _}, NewTk, NewString}} ->
            {ok, {Acc, NewTk, NewString}};
        {ok, {{_, S, Row, Start, End}, _, _}} ->
            % next_token/2 returns a 3-tuple {Token, Tk, String}; the pattern
            % must have that shape or bad input crashes with case_clause.
            {error, {unexpected_token, S, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.
 % Parse a map key; called after the "[" that opens an entry has been
 % consumed.
 %
 % On success continues with parse_map3/6, carrying the parsed key. On
 % failure returns {error, Wrapped}, where Wrapped is the reason passed
 % through wrap_error/2 with {map_key, N} and N is the number of entries
 % already in Acc. The result is tagged with error so callers matching on
 % {ok, _} | {error, _} see a well-formed value; previously the bare reason
 % was returned.
 parse_map2(KeyType, ValueType, Tk, String, Acc) ->
    case parse_expression(KeyType, Tk, String) of
        {ok, {Result, NewTk, NewString}} ->
            parse_map3(KeyType, ValueType, NewTk, NewString, Acc, Result);
        {error, Reason} ->
            Wrapped = wrap_error(Reason, {map_key, maps:size(Acc)}),
            {error, Wrapped}
    end.
 % After a map key: the tokens "]" and "=" must follow, in that order.
 % On success continues with parse_map4/6 to read the value for Key;
 % otherwise the error from expect_tokens/3 is returned unchanged.
 parse_map3(KeyType, ValueType, Tk, String, Acc, Key) ->
    Expected = ["]", "="],
    case expect_tokens(Expected, Tk, String) of
        {error, _} = Failure ->
            Failure;
        {ok, {AfterEquals, Remaining}} ->
            parse_map4(KeyType, ValueType, AfterEquals, Remaining, Acc, Key)
    end.
 % Parse the value belonging to Key and store the pair in the map.
 % A later entry with the same key replaces the earlier one. Errors from
 % the value parser are returned unchanged.
 parse_map4(KeyType, ValueType, Tk, String, Acc, Key) ->
    case parse_expression(ValueType, Tk, String) of
        {error, _} = Failure ->
            Failure;
        {ok, {Value, AfterValue, Remaining}} ->
            Updated = Acc#{Key => Value},
            parse_map5(KeyType, ValueType, AfterValue, Remaining, Updated)
    end.
 % After a map entry: expect "," (go back to parse_map/5 for the next
 % entry) or "}" (the map is complete).
 %
 % Acc is the map built so far. Returns {ok, {Map, Tk, String}} on "}" and
 % {error, {unexpected_token, Text, Row, Start, End}} for any other token.
 %
 % NOTE(review): because "," hands control back to parse_map/5, which
 % accepts "}", a trailing comma is accepted - confirm whether that is
 % intended.
 parse_map5(KeyType, ValueType, Tk, String, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, ",", _, _, _}, NewTk, NewString}} ->
            parse_map(KeyType, ValueType, NewTk, NewString, Acc);
        {ok, {{character, "}", _, _, _}, NewTk, NewString}} ->
            {ok, {Acc, NewTk, NewString}};
        {ok, {{_, S, Row, Start, End}, _, _}} ->
            % next_token/2 returns a 3-tuple {Token, Tk, String}; the pattern
            % must have that shape or bad input crashes with case_clause.
            {error, {unexpected_token, S, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.
 % TODO: attach the context (second argument, e.g. {list_element, Index} or
 % {map_key, Count}) to the error. For now the reason is returned untouched
 % and the context is ignored.
 wrap_error(Reason, _Context) ->
    Reason.
 %%% Tests
 % Test helper: parse Sophia against Type and require the result to be
 % exactly Fate. A parse failure crashes on the {ok, _} match; a differing
 % value raises {to_fate_failed, Expected, Actual}.
 check_sophia_to_fate(Type, Sophia, Fate) ->
    {ok, FateActual} = parse_literal(Type, Sophia),
    case FateActual =:= Fate of
        true ->
            ok;
        false ->
            erlang:error({to_fate_failed, Fate, FateActual})
    end.
 % Test helper: the literal must parse to Fate both with the given Type and
 % with no type information at all, and Fate must be accepted by
 % gmb_fate_encoding:serialize/1.
 check_parser(Type, Sophia, Fate) ->
    ok = check_sophia_to_fate(Type, Sophia, Fate),
    Untyped = unknown_type(),
    ok = check_sophia_to_fate(Untyped, Sophia, Fate),
    % The serialized form is discarded; the call only has to not crash.
    gmb_fate_encoding:serialize(Fate),
    ok.
 % Test helper: wrap the literal in a one-entrypoint contract, take the
 % second element of f's signature from the AACI as the expected type, and
 % run check_parser/3 with it.
 check_parser(Sophia, Fate) ->
    Contract = "contract C = entrypoint f() = " ++ Sophia,
    {ok, AACI} = hz_aaci:aaci_from_string(Contract),
    {ok, {_, Type}} = hz_aaci:get_function_signature(AACI, "f"),
    check_parser(Type, Sophia, Fate).
 % A bare integer literal parses to the same integer.
 int_test() ->
    Sophia = "123",
    check_parser(Sophia, 123).
 % A flat list of integers parses to an Erlang list in the same order.
 list_test() ->
    Sophia = "[1, 2, 3]",
    check_parser(Sophia, [1, 2, 3]).
 % Nested lists, including an empty inner list, keep their structure.
 list_of_lists_test() ->
    Sophia = "[[], [1], [2, 3]]",
    check_parser(Sophia, [[], [1], [2, 3]]).
 % A map literal written as {[K] = V, ...} parses to an Erlang map.
 maps_test() ->
    Sophia = "{[1] = 2, [3] = 4}",
    check_parser(Sophia, #{1 => 2, 3 => 4}).
Author	SHA1	Message	Date
Jarvis Carroll	56e63051bc	Map parsing	2026-01-16 05:46:27 +00:00
Jarvis Carroll	3f1c9bd626	List parsing Slowly chipping away at cases...	2026-01-15 09:38:04 +00:00
Jarvis Carroll	97e32574c4	set up parsing structure We tokenize, and then do the simplest possible recursive descent. We don't want to evaluate anything, so infix operators are out, meaning no shunting yard or tree rearranging or LR(1) shenanigans are necessary, just write the code. If we want to 'peek', just take the next token, and pass it around from that point on, until it can actually be consumed.	2026-01-15 01:52:30 +00:00
Jarvis Carroll	6f5525afcf	Rename get_function_signature hz_aaci:aaci_get_function_signature is a bit redundant.	2026-01-15 01:50:50 +00:00
Jarvis Carroll	4f1958b210	use lists:unzip/1 Just a little thing I noticed could be improved.	2026-01-13 01:19:29 +00:00
Jarvis Carroll	3da9bd570b	split coerce/3 into two functions Also renamed coerce_bindings to erlang_args_to_fate, to match.	2026-01-09 04:39:58 +00:00
Jarvis Carroll	d2163c1ff8	split AACI out of hz.erl So far the interface to hz.erl is mostly unchanged, apart from prepare_aaci/1 Maybe prepare_aaci should be re-exported, but using it is exactly in line with the 'inconvenient but more flexible primitives' that hz_aaci.erl is meant to represent, so, maybe that is a fine place to have to go for it, dunno.	2026-01-07 09:40:55 +00:00