set up parsing structure
We tokenize, and then do the simplest possible recursive descent. We don't want to evaluate anything, so infix operators are out. That means no shunting yard, tree rearranging, or LR(1) shenanigans are necessary; we just write the code. If we want to 'peek', we take the next token and pass it around from that point on, until it can actually be consumed.
This commit is contained in:
parent
6f5525afcf
commit
97e32574c4
@ -22,6 +22,8 @@
|
||||
fate_to_erlang/2,
|
||||
erlang_args_to_fate/2,
|
||||
get_function_signature/2]).
|
||||
% Internal stuff that is useful for writing AACI unit tests.
|
||||
-export([annotate_type/2]).
|
||||
|
||||
%%% Types
|
||||
|
||||
|
||||
130
src/hz_sophia.erl
Normal file
130
src/hz_sophia.erl
Normal file
@ -0,0 +1,130 @@
|
||||
-module(hz_sophia).
|
||||
-vsn("0.8.2").
|
||||
-author("Jarvis Carroll <spiveehere@gmail.com>").
|
||||
-copyright("Jarvis Carroll <spiveehere@gmail.com>").
|
||||
-license("GPL-3.0-or-later").
|
||||
|
||||
-include_lib("eunit/include/eunit.hrl").
|
||||
|
||||
% Parse String as exactly one literal expression, checked against Type.
%
% Parsing starts at position {tk, 1, 1}. Once an expression has been parsed,
% parse_literal2/3 verifies that nothing but the end of input follows it.
% An {error, Reason} returned by parse_expression/3 is passed through as-is.
parse_literal(Type, String) ->
    StartPos = {tk, 1, 1},
    case parse_expression(Type, StartPos, String) of
        {error, _} = Error ->
            Error;
        {ok, {Value, Pos, Remaining}} ->
            parse_literal2(Value, Pos, Remaining)
    end.
|
||||
|
||||
% Second half of parse_literal/2: an expression has already been parsed into
% Result, so the only acceptable next token is the end of input.
%
% Returns {ok, Result} on eof, {error, {unexpected_token, ...}} carrying the
% text and position of any other token, or the tokenizer's own error.
parse_literal2(Result, Tk, String) ->
    case next_token(Tk, String) of
        {error, _} = Error ->
            Error;
        {ok, {{eof, _, _, _, _}, _, _}} ->
            {ok, Result};
        {ok, {{_Kind, Text, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, Text, Row, Start, End}}
    end.
|
||||
|
||||
%%% Tokenizer
|
||||
|
||||
% Read the next token from the input string.
%
% The first argument is the current position, {tk, Row, Col}; the second is
% the remaining input. On success the result is
%     {ok, {Token, NewTk, Rest}}
% where Token is {Kind, Text, Row, Start, End}, NewTk is the position just
% after the token, and Rest is the unconsumed input. At the end of input an
% empty eof token is returned, with Start and End both equal to the current
% column.
%
% Spaces and tabs are skipped, each advancing the column by one. (Previously
% they advanced the row, which corrupted every position reported afterwards.)
% Any character that cannot start a token, including a newline, yields
%     {error, {unknown_char, Row, Col, [Char]}}.
next_token({tk, Row, Col}, []) ->
    {ok, {{eof, "", Row, Col, Col}, {tk, Row, Col}, []}};
next_token({tk, Row, Col}, " " ++ Rest) ->
    next_token({tk, Row, Col + 1}, Rest);
next_token({tk, Row, Col}, "\t" ++ Rest) ->
    next_token({tk, Row, Col + 1}, Rest);
next_token(Tk, [N | _] = String) when N >= $0, N =< $9 ->
    % A leading digit starts an integer token.
    num_token(Tk, Tk, String, []);
next_token(Tk, [C | _] = String)
  when C >= $A, C =< $Z; C >= $a, C =< $z; C =:= $_ ->
    % A leading letter or underscore starts an alphanumeric token.
    alphanum_token(Tk, Tk, String, []);
next_token({tk, Row, Col}, [Char | _]) ->
    {error, {unknown_char, Row, Col, [Char]}}.
|
||||
|
||||
% Consume a run of decimal digits and build an integer token.
%
% Start is the position where the token began; the second argument is the
% current position; Acc holds the digits read so far, in reverse order.
% Each digit consumed advances the column by one. (Previously the row was
% advanced instead, so the token reported a wrong row and Start == End.)
%
% Returns {ok, {{integer, Digits, Row, StartCol, EndCol}, NewTk, Rest}},
% where EndCol is the column just past the last digit and Digits is the
% token text as a string (not yet converted to a number).
num_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $0, N =< $9 ->
    num_token(Start, {tk, Row, Col + 1}, Rest, [N | Acc]);
num_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
    NumString = lists:reverse(Acc),
    Token = {integer, NumString, Row, Start, End},
    {ok, {Token, {tk, Row, End}, String}}.
|
||||
|
||||
% Consume a run of letters, digits and underscores and build an alphanum
% token.
%
% Start is the position where the token began; the second argument is the
% current position; Acc holds the characters read so far, in reverse order.
% Each character consumed advances the column by one. (Previously the
% position was never advanced, so End always equalled Start and every later
% token was reported at a stale column.)
%
% Returns {ok, {{alphanum, Text, Row, StartCol, EndCol}, NewTk, Rest}},
% where EndCol is the column just past the last character.
alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc)
  when C >= $A, C =< $Z; C >= $a, C =< $z; C >= $0, C =< $9; C =:= $_ ->
    alphanum_token(Start, {tk, Row, Col + 1}, Rest, [C | Acc]);
alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
    AlphaString = lists:reverse(Acc),
    Token = {alphanum, AlphaString, Row, Start, End},
    {ok, {Token, {tk, Row, End}, String}}.
|
||||
|
||||
|
||||
%%% Sophia Literal Parser
|
||||
|
||||
%%% This parser is a simple recursive descent parser, written explicitly in
|
||||
%%% erlang.
|
||||
%%%
|
||||
%%% There are no infix operators in the subset we want to parse, so recursive
|
||||
%%% descent is fine with no special tricks, no shunting yard algorithm, no
|
||||
%%% parser generators, etc.
|
||||
%%%
|
||||
%%% If we were writing this in C then we might want to work iteratively with an
|
||||
%%% array of finite state machines, i.e. with a pushdown automaton, instead of
|
||||
%%% using recursion. This is a tried and true method of making fast parsers.
|
||||
%%% Recall, however, that the BEAM *is* a stack machine, written in C, so
|
||||
%%% rather than writing confusing iterative code in Erlang, to simulate a
|
||||
%%% pushdown automaton inside another simulated stack machine... we should just
|
||||
%%% write the recursive code, thus programming the BEAM to implement the
|
||||
%%% pushdown automaton that we want.
|
||||
|
||||
% Parse one expression of the given Type, starting at position Tk in String.
%
% Reads the first token and hands it to parse_expression2/4, which decides
% what kind of expression it begins.
%
% Returns {ok, {Value, NewTk, Rest}} or {error, Reason}. Tokenizer errors
% (e.g. an unknown character) are returned as {error, Reason} rather than
% crashing with badmatch, so that parse_literal/2 can report them to the
% caller like any other parse error.
parse_expression(Type, Tk, String) ->
    case next_token(Tk, String) of
        {ok, {Token, NewTk, NewString}} ->
            parse_expression2(Type, NewTk, NewString, Token);
        {error, Reason} ->
            {error, Reason}
    end.
|
||||
|
||||
% Dispatch on the first token of an expression.
%
% An integer token is converted to an Erlang integer and type-checked against
% Type via check_type/6; Tk and String (the position and input after the
% token) are returned alongside the value. Every other token kind is reported
% as {error, {unexpected_token, Text, Row, Start, End}}.
parse_expression2(Type, Tk, String, {integer, Digits, Row, Start, End}) ->
    Parsed = {list_to_integer(Digits), Tk, String},
    check_type(integer, Type, Row, Start, End, Parsed);
parse_expression2(_Type, _Tk, _String, {_Kind, Text, Row, Start, End}) ->
    {error, {unexpected_token, Text, Row, Start, End}}.
|
||||
|
||||
% Check the kind of literal that was parsed (Expected) against the third
% element of the annotated type triple.
%
% The check passes, returning {ok, Result}, when that element equals Expected
% or is the atom unknown_type. The latter lets callers opt out of
% type-checking, since FATE is dynamically typed anyway.
%
% Otherwise the result is
%     {error, {wrong_type, First, Second, Expected, Row, Start, End}}
% where First and Second are the first two elements of the type triple.
check_type(Expected, {_, _, Actual}, _Row, _Start, _End, Result)
  when Actual =:= Expected; Actual =:= unknown_type ->
    {ok, Result};
check_type(Expected, {First, Second, _}, Row, Start, End, _Result) ->
    {error, {wrong_type, First, Second, Expected, Row, Start, End}}.
|
||||
|
||||
|
||||
%%% Tests
|
||||
|
||||
% Test helper: parse the Sophia literal with the given Type and require that
% the result is exactly Fate.
%
% Returns ok on a match. A parse failure crashes on the {ok, _} match; a
% successful parse with a different value raises
% {to_fate_failed, Expected, Actual}.
check_sophia_to_fate(Type, Sophia, Fate) ->
    {ok, Parsed} = parse_literal(Type, Sophia),
    if
        Parsed =:= Fate ->
            ok;
        true ->
            erlang:error({to_fate_failed, Fate, Parsed})
    end.
|
||||
|
||||
% Test helper: check that Sophia parses to Fate both with the real annotated
% Type and with its third element replaced by unknown_type (type-checking
% switched off).
check_parser(Type, Sophia, Fate) ->
    Untyped = setelement(3, Type, unknown_type),
    lists:foreach(fun(T) -> check_sophia_to_fate(T, Sophia, Fate) end,
                  [Type, Untyped]),

    % Finally, check that the FATE result is something that gmb understands.
    gmb_fate_encoding:serialize(Fate),

    ok.
|
||||
|
||||
% EUnit test: the decimal literal "123" parses to the integer 123, both with
% the annotated integer type and with type-checking disabled.
int_test() ->
    {ok, IntType} = hz_aaci:annotate_type(integer, #{}),
    check_parser(IntType, "123", 123).
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user