wip name cleanups

2026-06-01 18:00:37 -07:00
parent f548c7d88d
commit 9da6dbf18d
12 changed files with 804 additions and 281 deletions
@@ -0,0 +1,199 @@
+% @doc
+% Root AST node for a whole source file.
+%
+% Grammar: File ::= Block(TopDecl)
+%
+% top_decls is `none` until it is filled in with the list of the
+% file's top-level declarations.
+-record(ast_file,
+        {top_decls = none :: none | [top_decl()]}).
+
+
+% Union of the AST node kinds: the file root, a top-level
+% declaration, or a not-yet-implemented placeholder node.
+-type ast() :: #ast_file{}
+             | top_decl()
+             | #ast_nyi{}
+             .
+
+%%   Decl ::= 'type'     Id ['(' TVar* ')'] '=' TypeAlias
+%%          | 'record'   Id ['(' TVar* ')'] '=' RecordType
+%%          | 'datatype' Id ['(' TVar* ')'] '=' DataType
+%%          | 'let'      Id [':' Type]      '=' Expr
+%%          | (EModifier* 'entrypoint' | FModifier* 'function') Block(FunDecl)
+%%          | Using
+%-record(ast_type_alias,
+%        {name        = none :: none | string(),
+%         tvars       = none :: none | [string()],
+%         rewrites_to = none :: none | ast_type()}).
+
+
+% Atoms naming what gulp/2 should parse a token list as.
+%
+% NOTE(review): gulp/2 also has a clause for {block_of, X}, which this
+% type does not cover -- confirm whether it belongs here.
+-type gulp_target()
+    :: ast_file
+     | top_decl
+     | ast_ct
+     | ast_nyi
+     .
+
+% @doc
+% Parse Tokens as the AST node named by AstTarget.
+%
+% gulp means it must consume all input
+%
+% Returns
+%   {gulp, Ast}   the token list parsed as the target
+%   reject        the tokens are not an instance of the target (the
+%                 ast_ct path returns this, and gulp_file_decls/3
+%                 matches on it)
+%   {error, Err}  a parse error; an unrecognised target produces an
+%                 error with atom gulp_nyi
+-spec gulp(AstTarget, SigTokens) -> Perhaps
+    when AstTarget :: gulp_target() | {block_of, term()},
+         SigTokens :: [sfc_token()],
+         Perhaps   :: {gulp, ast()}
+                    | reject
+                    | {error, sfc_err()}.
+
+gulp(ast_file, Tokens) ->
+    gulp_file(Tokens);
+gulp(top_decl, Tokens) ->
+    % a top-level declaration is whichever of these targets accepts
+    Targets = [ast_ct,
+               ast_nyi],
+    gulp_oneof(Targets, Tokens);
+gulp(ast_ct, Tokens) ->
+    % start from an empty contract node; gulp_ct/2 fills in the fields
+    gulp_ct(#ast_ct{}, Tokens);
+gulp(ast_nyi, Tokens) ->
+    % placeholder node: always accepts and just keeps the tokens
+    {gulp, #ast_nyi{tokens = Tokens}};
+gulp({block_of, X}, Tokens) ->
+    % asserts that the block consumes every token (empty remainder)
+    {barf, ItemChunks, []} = sfc_token_chunks:barf(block_as_items, Tokens),
+    gulp_block_of(X, ItemChunks);
+gulp(Nyi, Tokens) ->
+    % unknown target: report it rather than crash
+    Msg = io_lib:format("sfc_ast:gulp/2: unknown target: ~p", [Nyi]),
+    Err = #sfc_err{atom   = gulp_nyi,
+                   string = Msg,
+                   extra  = [{target, Nyi}, {tokens, Tokens}]},
+    {error, Err}.
+
+
+
+% Builds up an #ast_ct{} one field at a time. The first clause whose
+% field is still `none` is the one that runs, so reading the clauses
+% top to bottom gives the order in which the pieces are expected:
+%
+%   ['payable'] ['main'] 'contract' Con [':' Con (',' Con)*] '=' Decls
+%
+% Returns {gulp, Ast}, `reject`, or {error, Err}.
+%
+% FIXME: payable and main need to be in that order i think
+
+% optional 'payable' keyword
+gulp_ct(Ct = #ast_ct{payable = none},
+        [#sfc_token{string = "payable", type = kwd} | Rest]) ->
+    gulp_ct(Ct#ast_ct{payable = payable}, Rest);
+gulp_ct(Ct = #ast_ct{payable = none}, Tks) ->
+    gulp_ct(Ct#ast_ct{payable = false}, Tks);
+% optional 'main' keyword
+gulp_ct(Ct = #ast_ct{main = none},
+        [#sfc_token{string = "main", type = kwd} | Rest]) ->
+    gulp_ct(Ct#ast_ct{main = main}, Rest);
+gulp_ct(Ct = #ast_ct{main = none}, Tks) ->
+    gulp_ct(Ct#ast_ct{main = false}, Tks);
+% required 'contract' keyword
+gulp_ct(Ct = #ast_ct{contract = none},
+        [#sfc_token{string = "contract", type = kwd} | Rest]) ->
+    gulp_ct(Ct#ast_ct{contract = contract}, Rest);
+% FIXME: reject logic applies to choice of branch, therefore
+% should be contained in branchpoint code
+%
+% draft of the error-returning alternative to `reject`:
+%
+%   [#sfc_token{pos = P, string = S} | _] ->
+%      {error, #sfc_err{atom = no_kwd_contract,
+%                        extra = [{pos, P},
+%                                 {expecting, "contract"},
+%                                 {got, S},
+%                                 {ast, Ast},
+%                                 {tokens, Tokens}]}};
+%   [] ->
+%      {error, #sfc_err{atom = no_kwd_contract,
+%                        extra = [{pos, none},
+%                                 {expecting, "contract"},
+%                                 {got, eof},
+%                                 {ast, Ast},
+%                                 {tokens, Tokens}]}}
+gulp_ct(#ast_ct{contract = none}, _Tks) ->
+    reject;
+% required contract name (a `con` token)
+gulp_ct(Ct = #ast_ct{name = none},
+        [#sfc_token{string = Name, type = con} | Rest]) ->
+    gulp_ct(Ct#ast_ct{name = Name}, Rest);
+gulp_ct(#ast_ct{name = none}, _Tks) ->
+    reject;
+% optional ': Con, Con, ...' list; absent means an empty list
+gulp_ct(Ct = #ast_ct{implements = none}, Tks) ->
+    case slurp_ct_impls(Tks) of
+        reject ->
+            gulp_ct(Ct#ast_ct{implements = {':', []}}, Tks);
+        {slurp, Names, Rest} ->
+            gulp_ct(Ct#ast_ct{implements = {':', Names}}, Rest);
+        Poison ->
+            Poison
+    end;
+% required '='
+gulp_ct(Ct = #ast_ct{eq = none},
+        [#sfc_token{string = "=", type = op} | Rest]) ->
+    gulp_ct(Ct#ast_ct{eq = '='}, Rest);
+gulp_ct(#ast_ct{eq = none}, _Tks) ->
+    {error, #sfc_err{atom = no_eq}};
+% everything that is left is the block of declarations
+gulp_ct(Ct = #ast_ct{decls = none}, Tks) ->
+    Items = sfc_token_chunks:unsafe_block_to_items(Tks),
+    Decls = [gulp(decl, Item) || Item <- Items],
+    {gulp, Ct#ast_ct{decls = Decls}};
+% every field already filled in (or not an #ast_ct{} at all)
+gulp_ct(_, _) ->
+    reject.
+
+
+% Reads an optional ': Con (, Con)*' list off the front of the tokens.
+% Returns {slurp, Names, Rest} when the tokens start with ':' followed
+% by a `con` token, and `reject` otherwise.
+slurp_ct_impls(Tokens) ->
+    case Tokens of
+        [#sfc_token{string = ":", type = op},
+         #sfc_token{string = First, type = con}
+         | Tail] ->
+            slurp_ct_impls2(Tail, [First]);
+        _ ->
+            reject
+    end.
+
+% Continues slurp_ct_impls/1: keeps taking ', Con' pairs for as long
+% as they appear. The accumulator is newest-first and is reversed on
+% the way out, so Names comes back in source order.
+slurp_ct_impls2(Tokens, Acc) ->
+    case Tokens of
+        [#sfc_token{string = ",", type = punct},
+         #sfc_token{string = Next, type = con}
+         | Tail] ->
+            slurp_ct_impls2(Tail, [Next | Acc]);
+        _ ->
+            {slurp, lists:reverse(Acc), Tokens}
+    end.
+
+
+
+
+
+-spec gulp_file(SigTokens) -> Perhaps
+    when SigTokens :: [sfc_token()],
+         Perhaps   :: {gulp, #ast_file{}}
+                    | {error, sfc_err()}.
+% @private
+% `file` enforces that the entire SigTokens is one
+% block, chokes otherwise
+%
+% Errors:
+%   empty_file    no tokens at all
+%   bad_file      the leading block stops before the end of input
+%   bad_file_nyi  sfc_token_chunks:barf/2 returned something else
+
+gulp_file([]) ->
+    {error, #sfc_err{atom = empty_file}};
+gulp_file(FileTokens = [#sfc_token{pos = FilePos} | _]) ->
+    case sfc_token_chunks:barf(block, FileTokens) of
+        % happy path: got the whole file back
+        % (FileTokens is already bound, so this is an equality check)
+        {barf, FileTokens, []} ->
+            gulp_full_file(FileTokens);
+        % sad path: block terminated, tokens are left over; EndPos is
+        % the position of the first leftover token, however many
+        % tokens follow it
+        {barf, _, [#sfc_token{pos = EndPos} | _]} ->
+            Msg = io_lib:format("block starting at ~p ends at ~p instead of EOF",
+                                [FilePos, EndPos]),
+            {error, #sfc_err{atom   = bad_file,
+                             string = Msg}};
+        Nyi ->
+            {error, #sfc_err{atom = bad_file_nyi, extra = Nyi}}
+    end.
+
+
+% FIXME: need to rethink types here in order to handle syntax errors
+% from different blocks independently.
+
+% file = block(top_decl)
+%
+% Splits the file's block into one token chunk per item and hands the
+% chunks to gulp_file_decls/3, starting with no declarations and no
+% errors collected.
+gulp_full_file(BlockTokens) ->
+    gulp_file_decls([], [], sfc_token_chunks:unsafe_block_to_items(BlockTokens)).
+
+
+% Parses each chunk as a top_decl, collecting declarations and errors
+% as it goes. Both accumulators are newest-first; the declarations
+% are reversed into source order on success, the errors are returned
+% as collected.
+%
+% end of block, nothing went wrong
+gulp_file_decls(Decls, [], []) ->
+    {gulp, #ast_file{top_decls = lists:reverse(Decls)}};
+% end of block, at least one chunk failed
+gulp_file_decls(_Decls, Errs, []) ->
+    {error, #sfc_err{atom = many,
+                      extra = Errs}};
+gulp_file_decls(Decls, Errs, [Chunk | Chunks]) ->
+    case gulp(top_decl, Chunk) of
+        {gulp, Decl} ->
+            gulp_file_decls([Decl | Decls], Errs, Chunks);
+        reject ->
+            % no target accepted this chunk: record where it starts
+            Pos = sfc_token_chunks:start_pos(Chunk),
+            Err = #sfc_err{atom = bad_top_decl,
+                           extra = [{tokens, Chunk},
+                                    {pos, Pos}]},
+            gulp_file_decls(Decls, [Err | Errs], Chunks);
+        Poison ->
+            % anything else is kept as-is among the errors
+            gulp_file_decls(Decls, [Poison | Errs], Chunks)
+    end.
@@ -0,0 +1,59 @@
+% @doc
+% working out infix parsing bullshit on toy arith language
+%
+% our operators for now are
+%
+% [+, *, ^] in outer->inner order
+-module(ifarith).
+
+-export([main/0]).
+
+% Sample input for the toy tokenizer.
+test_str() ->
+    "1 + 2 + 3".
+
+% Entry point: tokenizes the sample string and returns the tokens.
+main() ->
+    % first going to tokenize; tokens/2 takes the token stack built so
+    % far as its first argument, which starts out empty
+    tokens([], test_str()).
+
+% A token of the toy arith language.
+%   type - token class; `none` until set
+%   str  - source text of the token
+%   val  - value of the token
+-record(tk,
+        {type = none :: none | int | op | noise,
+         str  = none :: none | string(),
+         val  = none :: none | integer() | atom()}).
+
+% Tokenizer loop.
+%   Stk - tokens found so far, newest first
+%   second argument - the source characters still to be read
+% Returns the tokens in source order.
+tokens(Stk, []) ->
+    lists:reverse(Stk);
+tokens(Stk, [Char | Rest]) ->
+    case Char of
+        % [+*^] op token
+        Op when $+ =:= Op; $* =:= Op; $^ =:= Op ->
+            % val is the operator as an atom ('+', '*', '^'), which is
+            % what the atom() in the val field's type allows; only
+            % these three characters reach list_to_atom/1
+            Tk = #tk{type = op, str = [Op], val = list_to_atom([Op])},
+            tokens([Tk | Stk], Rest);
+        % [0-9] starts an int token; tk_int/3 reads the rest of it
+        D when $0 =< D, D =< $9 ->
+            {Tk, NewSrcStr} = tk_int([D], [D], Rest),
+            tokens([Tk | Stk], NewSrcStr);
+        % anything else is skipped
+        _ ->
+            tokens(Stk, Rest)
+    end.
+
+
+% Reads the rest of an int token.
+%   DigitStack - digits seen so far, newest first
+%   CharStack  - every source character of the token seen so far
+%                (digits and `_` separators), newest first
+%   SrcStr     - source characters still to be read
+% Returns the finished token and the unread remainder of the source.
+-spec tk_int(DigitStack, CharStack, SrcStr) -> Result when
+        DigitStack  :: string(),
+        CharStack   :: string(),
+        SrcStr      :: string(),
+        Result      :: {Token, NewSrcStr},
+        Token       :: #tk{},
+        NewSrcStr   :: string().
+
+tk_int(DigitStack, CharStack, SrcStr) ->
+    case SrcStr of
+        % cases when still consuming the int
+        % [0-9]
+        [D | NewSrcStr] when $0 =< D, D =< $9 ->
+            tk_int([D | DigitStack], [D | CharStack], NewSrcStr);
+        % `_` is part of the token's text but not of its digits
+        [$_ | NewSrcStr] ->
+            tk_int(DigitStack, [$_ | CharStack], NewSrcStr);
+        % otherwise done: both stacks are newest-first, so reverse
+        % them; SrcStr is returned untouched for the caller to go on
+        _ ->
+            Digits = lists:reverse(DigitStack),
+            Token  = #tk{type = int,
+                         str  = lists:reverse(CharStack),
+                         val  = list_to_integer(Digits)},
+            {Token, SrcStr}
+    end.
@@ -0,0 +1,88 @@
+%% placeholder: stands in for an infix tree node in record field types
+-type ifx_tree_() :: any().
+
+%% placeholders: `any()` stand-ins used in record field types
+-type ast_() :: any().
+-type ast_type_expr_() :: any().
+-type ast_te_() :: any().
+
+
+% @doc
+% product type: foo * bar * baz
+%
+% A product always has at least two operands. This is a consequence
+% of the syntax overloading what parens do; probably it is because
+% `(foo)` is always the same as `foo`
+%
+% - 0-tuple -> `unit`
+% - 1-tuple -> type itself
+% - 2+      -> here
+%
+% types is the list of operand type expressions.
+-record(ast_te_prod2,
+        {types = none :: none | [ast_te_()]}).
+
+
+
+% @doc
+% function type: (string, string) => string
+%
+% dom   - list of type expressions (the domain)
+% codom - a single type expression (the codomain)
+-record(ast_te_ts_to_t,
+        {dom   = none :: none | [ast_te_()],
+         codom = none :: none | ast_te_()}).
+
+
+% @doc
+% application type: map(string, int)
+%
+% fn   - a single type expression, the thing being applied
+% args - list of type expressions it is applied to
+-record(ast_te_t_of_ts,
+        {fn    = none :: none | ast_te_(),
+         args  = none :: none | [ast_te_()]}).
+
+
+% @doc
+% node for a type name
+% token type id   : string int unit
+%            qid  : Foo.Bar.baz
+%            tvar : 'a
+%
+% name holds the token itself.
+-record(ast_te_name,
+        {name = none :: none | sfc_token()}).
+
+
+% @doc
+% placeholder type-expression node: just carries a list of tokens
+-record(ast_te_nyi,
+        {tokens = none :: none | [sfc_token()]}).
+% alias for the record above (ast_te_nyi, not the ast_nyi record)
+-type ast_te_nyi() :: #ast_te_nyi{}.
+
+
+% Any type-expression node. The token case is the ast_te_name record.
+-type ast_type_expr()
+    :: #ast_te_ts_to_t{}   % function    (string, string) => string
+     | #ast_te_t_of_ts{}   % application map(string, int)
+     | #ast_te_prod2{}     % product     foo * bar * baz
+     | #ast_te_name{}      % token       string int 'a Foo.Bar.baz
+     | #ast_te_nyi{}.
+% short alias
+-type ast_te() :: ast_type_expr().
+
+
+% Infix tree node for an operator.
+%   left, right - lists of infix trees on each side of the operator
+%   op          - the operator
+%   op_token    - the operator's token, wrapped as {value, Token}
+-record(ifx_stem_op,
+        {left     = none :: none | [ifx_tree_()],
+         op       = none :: none | infix_op(),
+         op_token = none :: none | {value, sfc_token()},
+         right    = none :: none | [ifx_tree_()]}).
+
+% Infix tree node holding a list of items.
+% NOTE(review): presumably a parenthesised list, going by the name --
+% confirm.
+-record(ifx_stem_plist,
+        {items :: [any()]}).
+
+% Infix tree leaf: a single token.
+-record(ifx_leaf_idtk,
+        {token :: sfc_token()}).
+
+% An infix tree is one of the node records above.
+-type ifx_tree()
+        :: #ifx_stem_op{}
+         | #ifx_stem_plist{}
+         | #ifx_leaf_idtk{}.
+
+
+% The return variable and its constraint must carry the same name.
+-spec slurp_ifx_tree(Tokens) -> SlurpedIfxTree when
+        Tokens         :: [sfc_token()],
+        SlurpedIfxTree :: slurped(ifx_tree()).
+
+slurp_ifx_tree(Tokens) ->
+    case take_until_ifx_op(Tokens) of
+        
@@ -0,0 +1,193 @@
+% @doc
+% helper functions for grabbing collections of tokens
+% off the token stream
+%
+% generally assume no whitespace/comment tokens in
+% input stream
+-module(sfc_token_chunks).
+
+%-export_type([
+%    chunk_shape/0,
+%    choke_reason/0
+%]).
+%
+%-export([
+%    take/2,
+%    unsafe_block_to_items/1,
+%    barf/2,
+%    start_pos/1,
+%    end_pos/1
+%]).
+%
+%% $sfc_include is so c() works from sfp eshell
+%-include("$sfc_include/sfc.hrl").
+%
+%%------------------------------------------
+%% Types
+%%------------------------------------------
+%
+%-type chunk_shape()
+%    :: block
+%     | block_item
+%     | {block_item, Level :: pos_integer()}
+%     | block_as_items
+%     .
+%
+%% FIXME
+%-type choke_reason() :: any().
+%
+%
+%%------------------------------------------
+%% functions
+%%------------------------------------------
+%
+%% take = just split
+%
+%take(block, []) ->
+%    {[], []};
+%take(block, [Hd = #sfc_token{pos = {_, BCol}} | Tl]) ->
+%    tw(fun(#sfc_token{pos = {_, TkCol}}) -> BCol =< TkCol end, [Hd], Tl);
+%take(block_item, []) ->
+%    {[], []};
+%take(block_item, [Hd = #sfc_token{pos = {_, ICol}} | Tl]) ->
+%    tw(fun(#sfc_token{pos = {_, TkCol}}) -> ICol < TkCol end, Tl).
+%
+%
+%
+%-spec start_pos([sfc_token()]) -> {value, sfc_pos()} | none.
+%
+%start_pos([#sfc_token{pos = P}]) -> {value, P};
+%start_pos([])                    -> none.
+%
+%
+%-spec end_pos([sfc_token()]) -> {value, sfc_pos()} | none.
+%
+%end_pos([#sfc_token{pos = Pos, string = Str}]) ->
+%    {value, sfc_tokens:new_pos(Pos, Str)};
+%end_pos([_ | T]) ->
+%    end_pos(T);
+%end_pos([]) ->
+%    none.
+%
+%
+%-spec barf(ChunkShape, SigTokens) -> Perhaps
+%    when ChunkShape :: chunk_shape(),
+%         SigTokens  :: [Token],
+%         Perhaps :: {barf, Chunk, Rest}
+%                  | {choke, Reason},
+%         Chunk  :: [Token]      % most
+%                 | [[Token]],   % block_as_items
+%         Rest   :: [Token],
+%         Reason :: choke_reason(),
+%         Token :: sfc_token().
+%
+%% @doc
+%% slurp/barf terminology comes from paredit mode in
+%% emacs
+%%
+%% slurp ~= accepting input
+%% barf ~= separating input
+%%
+%%   slurp: (foo bar) baz ~> (foo bar baz)
+%%   barf : (foo bar baz) ~> foo (bar baz)
+%
+%barf(_, []) ->
+%    {barf, [], []};
+%barf(block, [H = #sfc_token{pos = {_, BlkCol}} | T]) ->
+%    Take =
+%        fun(#sfc_token{pos = {_, TkCol}}) ->
+%            BlkCol =< TkCol
+%        end,
+%    {A, B} = tw(Take, T),
+%    {barf, [H | A], B};
+%barf(block_item, [H = #sfc_token{pos = {_, BlkCol}} | T]) ->
+%    Take =
+%        fun(#sfc_token{pos = {_, TkCol}}) ->
+%            BlkCol < TkCol
+%        end,
+%    {A, B} = tw(Take, T),
+%    {barf, [H | A], B};
+%% not needed for our case, future-proofing. see unsafe_block_to_items
+%% for details
+%barf({block_item, Level}, Tokens = [#sfc_token{pos = {_, StartLevel}} | _]) ->
+%    case Level =:= StartLevel of
+%        false -> {barf, [], Tokens};
+%        true  -> barf(block_item, Tokens)
+%    end;
+%% this has a fancy name in Haskell like Lens . lift ^. mapM_
+%%
+%% i think it's `sequence` actually, but not looking it up
+%%
+%% this barfs a block, and then uses unsafe_block_to_items/1 to split
+%% the block tokens into individual items
+%barf(block_as_items, Tokens) ->
+%    {barf, BlockTokens, Rest} = barf(block, Tokens),
+%    {barf, unsafe_block_to_items(BlockTokens), Rest};
+%barf(_, _) ->
+%    {choke, #sfc_err_nyi{}}.
+%
+%
+%
+%
+%
+%
+%-spec unsafe_block_to_items([Token]) -> [[Token]]
+%    when Token :: sfc_token().
+%
+%% @doc
+%% PITFALL: this ASSUMES that the given list of tokens has the
+%% property that all indent levels are >= that of the head... i.e. the
+%% input to this is assumed to be the output of (e.g.) barf(block, _)
+%%
+%% the danger case is something my intuition is pointing to as a
+%% possibility perhaps if you're doing some incremental parallel
+%% stream parsing voodoo, naively parsing a block by greedily pulling
+%% block items off the head of the list
+%%
+%% with the current way things work, we actually do not need to check
+%% the indent level of each block item and make sure they're all the
+%% same
+%%
+%%   BLOCK =
+%%       foo
+%%           ...
+%%       bar
+%%           ...
+%%       baz
+%%           ...
+%%
+%%   BLOCK_ITEM =
+%%       foo
+%%           ...
+%%
+%%
+%% very important property of blocks is that each list item starts at
+%% the same indent level.
+%%
+%% a concern would be that when we go to grab the bar item that
+%% BarIndentLevel is somehow different from FooIndentLevel.
+%%
+%% let us reason through why it must be the case that FooIndentLevel
+%% =:= BarIndentLevel
+%%
+%% 1. not (BarIndentLevel < FooIndentLevel); i.e.
+%%
+%%       // impossible by call path:
+%%          foo ...
+%%       bar ...
+%%
+%%    This is impossible because the call path ensures that all tokens
+%%    in BlkItems have indent level >= FooIndentLevel
+%%
+%% 2. not (FooIndentLevel < BarIndentLevel),
+%%
+%%       // impossible because bar would get
+%%       // consumed by the foo block
+%%       foo ...
+%%           bar ...
+%
+%unsafe_block_to_items([]) ->
+%    [];
+%unsafe_block_to_items(BlockTks) ->
+%    {barf, ItemTks, NewBlockTks} = barf(block_item, BlockTks),
+%    [ItemTks | unsafe_block_to_items(NewBlockTks)].