diff --git a/src/hz_aaci.erl b/src/hz_aaci.erl
index f9da33f..8fda712 100644
--- a/src/hz_aaci.erl
+++ b/src/hz_aaci.erl
@@ -22,6 +22,8 @@
          fate_to_erlang/2,
          erlang_args_to_fate/2,
          get_function_signature/2]).
+% Internal stuff that is useful for writing AACI unit tests.
+-export([annotate_type/2]).
 
 %%% Types
 
diff --git a/src/hz_sophia.erl b/src/hz_sophia.erl
new file mode 100644
index 0000000..b428616
--- /dev/null
+++ b/src/hz_sophia.erl
@@ -0,0 +1,130 @@
+-module(hz_sophia).
+-vsn("0.8.2").
+-author("Jarvis Carroll ").
+-copyright("Jarvis Carroll ").
+-license("GPL-3.0-or-later").
+
+-export([parse_literal/2]).
+
+-include_lib("eunit/include/eunit.hrl").
+
+%% Parse `String' as exactly one Sophia literal of the annotated type `Type'.
+%%
+%% `Type' is passed through to parse_expression/3 unchanged. Parsing starts at
+%% row 1, column 1. Returns `{ok, Value}' when one expression was parsed and
+%% nothing but whitespace follows it; any `{error, Reason}' produced along the
+%% way is returned as-is.
+-spec parse_literal(Type :: term(), String :: string()) ->
+    {ok, term()} | {error, term()}.
+parse_literal(Type, String) ->
+    case parse_expression(Type, {tk, 1, 1}, String) of
+        {ok, {Result, NewTk, NewString}} ->
+            parse_literal2(Result, NewTk, NewString);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+%% A valid expression has been parsed; make sure the input ends there.
+%% Any token other than `eof' is reported as `unexpected_token' together with
+%% its text and position.
+parse_literal2(Result, Tk, String) ->
+    case next_token(Tk, String) of
+        {ok, {{eof, _, _, _, _}, _, _}} ->
+            {ok, Result};
+        {ok, {{_, S, Row, Start, End}, _, _}} ->
+            {error, {unexpected_token, S, Row, Start, End}};
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+%%% Tokenizer
+
+%% Read the next token of `String', starting at position `{tk, Row, Col}'.
+%%
+%% Returns `{ok, {Token, NewTk, Rest}}' where `Token' is
+%% `{Kind, Text, Row, StartCol, EndCol}', `NewTk' is the position just after
+%% the token and `Rest' is the unconsumed input. End of input yields an `eof'
+%% token with empty text. A character that cannot start any token yields
+%% `{error, {unknown_char, Row, Col, [Char]}}'.
+next_token({tk, Row, Col}, []) ->
+    {ok, {{eof, "", Row, Col, Col}, {tk, Row, Col}, []}};
+next_token({tk, Row, Col}, [C | Rest]) when C =:= $\s; C =:= $\t ->
+    % Horizontal whitespace stays on the same row and moves one column right.
+    next_token({tk, Row, Col + 1}, Rest);
+next_token({tk, Row, _}, [$\n | Rest]) ->
+    % A newline is whitespace too; it starts the next row at column 1.
+    next_token({tk, Row + 1, 1}, Rest);
+next_token(Tk, [C | _] = String) when C >= $0, C =< $9 ->
+    num_token(Tk, Tk, String, []);
+next_token(Tk, [C | _] = String)
+  when C >= $A, C =< $Z; C >= $a, C =< $z; C =:= $_ ->
+    alphanum_token(Tk, Tk, String, []);
+next_token({tk, Row, Col}, [Char | _]) ->
+    {error, {unknown_char, Row, Col, [Char]}}.
+
+%% Scan a run of decimal digits.
+%%
+%% The first argument is the position where the token started; the second is
+%% the current position, which moves one column right per digit consumed.
+%% Returns `{ok, {{integer, Digits, Row, StartCol, EndCol}, NewTk, Rest}}',
+%% where `EndCol' is the column immediately after the last digit and `Rest'
+%% is the input left over.
+num_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $0, N =< $9 ->
+    num_token(Start, {tk, Row, Col + 1}, Rest, [N | Acc]);
+num_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
+    NumString = lists:reverse(Acc),
+    Token = {integer, NumString, Row, Start, End},
+    {ok, {Token, {tk, Row, End}, String}}.
+
+%% Scan a run of ASCII letters, digits and underscores.
+%%
+%% Position handling mirrors num_token/4: the current position moves one
+%% column right per character consumed, and the resulting
+%% `{alphanum, Text, Row, StartCol, EndCol}' token ends at the column
+%% immediately after its last character.
+alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc)
+  when C >= $A, C =< $Z; C >= $a, C =< $z; C >= $0, C =< $9; C =:= $_ ->
+    alphanum_token(Start, {tk, Row, Col + 1}, Rest, [C | Acc]);
+alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
+    AlphaString = lists:reverse(Acc),
+    Token = {alphanum, AlphaString, Row, Start, End},
+    {ok, {Token, {tk, Row, End}, String}}.
+
+
+%%% Sophia Literal Parser
+
+%%% This parser is a simple recursive descent parser, written explicitly in
+%%% Erlang.
+%%%
+%%% There are no infix operators in the subset we want to parse, so recursive
+%%% descent is fine with no special tricks, no shunting yard algorithm, no
+%%% parser generators, etc.
+%%%
+%%% If we were writing this in C then we might want to work iteratively with an
+%%% array of finite state machines, i.e. with a pushdown automaton, instead of
+%%% using recursion. This is a tried and true method of making fast parsers.
+%%% Recall, however, that the BEAM *is* a stack machine, written in C, so
+%%% rather than writing confusing iterative code in Erlang, to simulate a
+%%% pushdown automaton inside another simulated stack machine... we should just
+%%% write the recursive code, thus programming the BEAM to implement the
+%%% pushdown automaton that we want.
+
+%% Parse one expression of annotated type `Type' from `String', starting at
+%% position `Tk'.
+%%
+%% Returns `{ok, {Value, NewTk, Rest}}' on success. A tokenizer failure is
+%% returned as `{error, Reason}' instead of crashing with a badmatch, so that
+%% callers matching on `{error, _}' actually receive it.
+parse_expression(Type, Tk, String) ->
+    case next_token(Tk, String) of
+        {ok, {Token, NewTk, NewString}} ->
+            parse_expression2(Type, NewTk, NewString, Token);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+%% Dispatch on the first token of an expression. Only integer literals are
+%% accepted; every other token is reported as `unexpected_token' with its
+%% text and position.
+parse_expression2(Type, Tk, String, {integer, S, Row, Start, End}) ->
+    Value = list_to_integer(S),
+    check_type(integer, Type, Row, Start, End, {Value, Tk, String});
+parse_expression2(_, _, _, {_, S, Row, Start, End}) ->
+    {error, {unexpected_token, S, Row, Start, End}}.
+
+%% Compare the type tag a literal naturally has (`Expected') with the third
+%% element of the annotated type.
+%%
+%% Returns `{ok, Result}' when the two are equal, or when the annotation
+%% carries `unknown_type'. Otherwise the first two elements of the annotation
+%% are reported in a `wrong_type' error along with the token's position.
+check_type(Expected, {_, _, Expected}, _, _, _, Result) ->
+    {ok, Result};
+check_type(_, {_, _, unknown_type}, _, _, _, Result) ->
+    % We want it to be possible to opt out of type-checking, since FATE is
+    % dynamically typed anyway.
+    {ok, Result};
+check_type(Expected, {O, N, _}, Row, Start, End, _) ->
+    {error, {wrong_type, O, N, Expected, Row, Start, End}}.
+
+
+%%% Tests
+
+%% Parse `Sophia' under `Type' and fail with `to_fate_failed' unless the
+%% result is exactly `Fate'.
+check_sophia_to_fate(Type, Sophia, Fate) ->
+    {ok, FateActual} = parse_literal(Type, Sophia),
+    case FateActual of
+        Fate ->
+            ok;
+        _ ->
+            erlang:error({to_fate_failed, Fate, FateActual})
+    end.
+
+%% Run the parser twice, once with the given annotated type and once with its
+%% third element replaced by `unknown_type', expecting `Fate' both times.
+check_parser(Type, Sophia, Fate) ->
+    UnknownType = setelement(3, Type, unknown_type),
+    check_sophia_to_fate(Type, Sophia, Fate),
+    check_sophia_to_fate(UnknownType, Sophia, Fate),
+
+    % Finally, check that the FATE result is something that gmb understands.
+    gmb_fate_encoding:serialize(Fate),
+
+    ok.
+
+int_test() ->
+    {ok, Type} = hz_aaci:annotate_type(integer, #{}),
+    check_parser(Type, "123", 123).
+
+% A character the tokenizer rejects must come back as an error tuple rather
+% than crash the parser.
+unknown_char_test() ->
+    {ok, Type} = hz_aaci:annotate_type(integer, #{}),
+    {error, {unknown_char, 1, 1, "?"}} = parse_literal(Type, "?").
+