diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/bin/gsc b/bin/gsc new file mode 100755 index 0000000..9347aa0 --- /dev/null +++ b/bin/gsc @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +# Bash harness for GSC command line tool +# +# gsc foo bar baz +# -> zx rundir \ +# /home/pharpend/src/ioecs/gsc/cli \ +# --libs=gsc:/home/pharpend/src/ioecs/gsc \ +# foo bar baz + +zx_exists() { + command -v zx > /dev/null +} + + +if ! zx_exists ; then + echo "zx not found in \$PATH" + echo "" + echo "Please install Erlang and zx:" + echo "https://git.qpq.swiss/QPQ-AG/public-wiki/wiki/Install-Erlang-and-zx" + echo "" + echo "PATH=${PATH}" + exit 1 +fi + +# script location and parent dir +FP_THIS_FILE=$(readlink -f "${BASH_SOURCE[0]}") +FP_THIS_DIR=$(dirname -- "${FP_THIS_FILE}") +FP_PRJ_DIR=$(dirname -- "${FP_THIS_DIR}") +FP_CLI_DIR="${FP_PRJ_DIR}/cli" + +# # compute libs string +# LIB_PARTS=() +# for depname in "${LOCAL_DEPS[@]}"; do +# LIB_PARTS+=("${depname}:${PRJ_DIR}/${depname}") +# done +# IFS=, +# LOCAL_LIBS="${LIB_PARTS[*]}" +# unset IFS + +LOCAL_LIBS="gsc:${FP_PRJ_DIR}" + +#echo "zx rundir $FP_CLI_DIR --libs=$LOCAL_LIBS $@" + +zx rundir $FP_CLI_DIR --libs=$LOCAL_LIBS $@ diff --git a/cli/.gitignore b/cli/.gitignore new file mode 100644 index 0000000..20177b4 --- /dev/null +++ b/cli/.gitignore @@ -0,0 +1,15 @@ +.eunit +deps +*.o +*.beam +*.plt +*.swp +erl_crash.dump +ebin/*.beam +doc/*.html +doc/*.css +doc/edoc-info +doc/erlang.png +rel/example_project +.concrete/DEV_MODE +.rebar diff --git a/cli/Emakefile b/cli/Emakefile new file mode 100644 index 0000000..68c7b67 --- /dev/null +++ b/cli/Emakefile @@ -0,0 +1 @@ +{"src/*", [debug_info, {i, "include/"}, {outdir, "ebin/"}]}. diff --git a/cli/ebin/gsc_cli.app b/cli/ebin/gsc_cli.app new file mode 100644 index 0000000..b409547 --- /dev/null +++ b/cli/ebin/gsc_cli.app @@ -0,0 +1,7 @@ +{application,gsc_cli, + [{description,"GSC CLI and test suite"}, + {registered,[]}, + {included_applications,[]}, + {applications,[stdlib,kernel]}, + {vsn,"0.1.0"}, + {modules,[gsc_cli]}]}. diff --git a/cli/src/gsc_cli.erl b/cli/src/gsc_cli.erl new file mode 100644 index 0000000..51aaae7 --- /dev/null +++ b/cli/src/gsc_cli.erl @@ -0,0 +1,25 @@ +%%% @doc +%%% GSC CLI: gsc_cli +%%% +%%% This module is currently named `gsc_cli', but you may want to change that. +%%% Remember that changing the name in `-module()' below requires renaming +%%% this file, and it is recommended to run `zx update .app` in the main +%%% project directory to make sure the ebin/gsc_cli.app file stays in +%%% sync with the project whenever you add, remove or rename a module. +%%% @end + +-module(gsc_cli). +-vsn("0.1.0"). +-author("Peter Harpending "). +-copyright("Peter Harpending "). +-license("GPL-3.0-only"). + +-export([start/1]). + + +-spec start(ArgV) -> ok + when ArgV :: [string()]. + +start(ArgV) -> + ok = io:format("Hello, World! Args: ~tp~n", [ArgV]), + zx:silent_stop(). diff --git a/cli/zomp.meta b/cli/zomp.meta new file mode 100644 index 0000000..5e0917a --- /dev/null +++ b/cli/zomp.meta @@ -0,0 +1,18 @@ +{name,"GSC CLI"}. +{type,cli}. +{modules,[]}. +{mod,"gsc_cli"}. +{prefix,none}. +{author,"Peter Harpending"}. +{desc,"GSC CLI and test suite"}. +{package_id,{"otpr","gsc_cli",{0,1,0}}}. +{deps,[{"otpr","gsc",{0,1,0}}]}. +{key_name,none}. +{a_email,"peterharpending@qpq.swiss"}. +{c_email,"peterharpending@qpq.swiss"}. +{copyright,"Peter Harpending"}. +{file_exts,[]}. +{license,"GPL-3.0-only"}. +{repo_url,[]}. +{tags,[]}. +{ws_url,[]}. diff --git a/include/gsc.hrl b/include/gsc.hrl new file mode 100644 index 0000000..f02f96d --- /dev/null +++ b/include/gsc.hrl @@ -0,0 +1,161 @@ +% This is a header file that contains sfc's record types +% +% This is in order to +% 1. share records across modules; and, +% 2. allow external modules to just use the sfc records + +%------------------------------------------------------- +% API Types: sfc internal token representation +% +% -export_type([ +% sf_token_type/0 +% sf_token/0 +% ]). +%------------------------------------------------------- + +% +-type sfc_token_type() + :: bcom % /* ... */ + | lcom % // + | ws % whitespace + % literals + | char % 'a' + | string % "foo" + | int10 % 69_420 + | int16 % 0xDEAD_BEEF + | bytes % #DEAD_BEEF + | ak % ak_ABC + | ct % ct_ABC + | sg % sg_ABC + % kwds/variables/etc + | id % foo, foo_bar, foo_bar'baz' _'foo' + | con % Foo, Foo_Bar, FooBar + | qid % Foo.Bar.baz + | qcon % Foo.Bar.Baz + | tvar % 'foo, 'foo_bar, '_'foo'_'bar''' + % kwds ops and punct are all collapsed by + % so_scan:scan down to eg {'contract', {420, 69}} + % where {420, 69} is the source location + % these are three different parsers + | kwd % contract, interface, payable, etc + | op % "=!<>+-*/:&|?~@^" + | punct % ".." | oneof(",.;()[]{}") + % kwds and punct are kind of the same thing + % but i'll keep them separate now for my own sanity. ok + % i guess op or symbol or whatever is fine. + % + % not going to overthink. if having them separate + % becomes an issue it's easy enough to collapse. harder + % to separate afterward if collapsing is wrong. + . + + +-type sfc_pos() :: {Line :: pos_integer(), Col :: pos_integer()}. + +-record(sfc_token, + {type :: sfc_token_type(), + pos :: sfc_pos(), + string :: string()}). + +-type sfc_token() :: #sfc_token{}. + + +% tokens are in essence the "chunk boundaries" of +% the file +% +% because we have semantic whitespace, we have to be +% careful about block declarations, because we don't +% have an explicit open/close block token. blocks can +% be closed in one of two ways: +% +% 1. a new block at a previous indent level: +% switch(foo) +% // block starts here +% Bar => bar() +% Baz => baz() +% quux() // ends because indent level +% 2. it's part of some type of list: +% +% [switch(foo) +% // block starts here +% Bar => bar() +% Baz => baz(), // ends here +% switch(bizz) +% // block starts here +% Bar => bar() +% Baz => baz()] +% +% in order to avoid speculatively inserting virtual +% close tokens, at least on first write-out, we're +% going to disambiguate list notions right away + +% token groups +% lists = (_, _, _) +% | [_, _, _] +% | {_, _, _} +%-record(sfc_ast1_block, +% {indent = none :: none | pos_integer(), +% decls = none :: [sfc_ast1_decl()]}). +% +%-type sfc_ast() :: +% +%-type sfc_list_group() :: {'(', [sfc_token()], ')'} +% | {'[', [sfc_token()], ']'} +% | {'{', [sfc_token()], '}'} +% | {proof, +% . + + + +% @doc +% this one is very specific so it deserves its own +% record type: unterminated block comments at the end +% of files. these are ok in legacy sophia, so we have to +% specifically account for this error +-record(sfc_err_bcom_unterminated, + {prev_tokens :: [sfc_token()], + break_pos :: sfc_pos(), + rest :: string()}). + +-record(sfc_err_no_tokmatch, + {prev_tokens :: [sfc_token()], + break_pos :: sfc_pos(), + rest :: string()}). + + +-record(sfc_err_delims, + {past :: [sfc_token()], + open_stack :: [sfc_token()], + bad_close :: sfc_token(), + future :: [sfc_token()]}). +% FIXME +-record(sfc_err_nyi, {}). +-record(sfc_err_empty_file, {}). + +%-record(src_parse_error, +% {atom = none :: none | atom(), +% string = + +%j-record(sfc_err_gulp_ct, +%j {gulped :: + +% @doc +% generic placeholder error for now +-record(sfc_err, + {atom :: atom(), + string = none :: none | iolist(), + extra = none :: none | any()}). + +% @doc all errors SFC can return conveniently listed in +% one place +-type sfc_err() :: #sfc_err_bcom_unterminated{} + | #sfc_err_no_tokmatch{} + | #sfc_err_nyi{} + | #sfc_err_empty_file{} + | #sfc_err{}. + + +%% FIXME +-type sfc_ast() :: any(). + + diff --git a/src/gsc.erl b/src/gsc.erl index ce4d2d8..90f3475 100644 --- a/src/gsc.erl +++ b/src/gsc.erl @@ -1,23 +1,144 @@ -%%% @doc -%%% Gajumaru Sophia Compiler: gsc -%%% -%%% This module is currently named `gsc', but you may want to change that. -%%% Remember that changing the name in `-module()' below requires renaming -%%% this file, and it is recommended to run `zx update .app` in the main -%%% project directory to make sure the ebin/gsc.app file stays in -%%% sync with the project whenever you add, remove or rename a module. -%%% @end +% @doc bikeshed proctrastination head into vim warmup thing +% sophia compiler from scratch by PRH +% +% based on original sophia compiler +% +% parse layers: +% 1. sfc_tokenizer: SrcStr -> (Tokens | SigTokens) +% +% SigTokens = not comment/whitespace +% +% layers: +% a. sfc_strmatch : matches string shapes +% b. sfc_so_scan : converts to so_scan shapes +% +% 2. sfc_ast: SigTokens -> AST +% +% terminology: +% +% - `slurp`/`barf` borrowed from emacs paredit mode: +% +% slurp : (a b) c -> (a b c) +% barf : (a b c) -> a (b c) +% +% * `slurp` usually involves *transforming* input +% into a new type (e.g. slurp a token from src +% string); think of slurp as a verb meaning to +% consume and then digest +% * `barf` basically means blindly splitting off +% input +% +% @end --module(gsc). --vsn("0.1.0"). --author("Peter Harpending "). --copyright("Peter Harpending "). --license("GPL-3.0-only"). - --export([hello/0]). +% TODO: +% - barf for outputs, slurp for inputs +% - architecture needs more careful thought but only after something works +% - too fuzzy right now +% - possibly: +% - rename parser layers sequentially: +% - sfc_ +-module(sfc). --spec hello() -> ok. +-export_type([ + token/0 +]). -hello() -> - io:format("~p (~p) says \"Hello!\"~n", [self(), ?MODULE]). +-export([ + sigtokens_from_file/1, + sigtokens_from_string/1, + tokens_from_file/1, + tokens_from_string/1, + ast_from_file/1, + ast_from_string/1, + ast_from_tokens/1 +]). + +-include("$sfc_include/sfc.hrl"). + +%----------------------------------------- +% types +%----------------------------------------- + +-type token() :: sfc_token(). + +%----------------------------------------- +% functions +%----------------------------------------- + +sigtokens_from_file(X) -> + case tokens_from_file(X) of + {ok, Y} -> {ok, sfc_tokens:filter_significant(Y)}; + Err -> Err + end. + +sigtokens_from_string(X) -> + case tokens_from_string(X) of + {ok, Y} -> {ok, sfc_tokens:filter_significant(Y)}; + Err -> Err + end. + + +-spec tokens_from_file(FilePath) -> Perhaps + when FilePath :: string(), + Perhaps :: {ok, Tokens} + | {error, sfc_err() | any()}, + Tokens :: [sfc_token()]. + +tokens_from_file(FilePath) -> + case file:read_file(FilePath) of + {ok, FBytes} -> tokens_from_string(FBytes); + Error -> Error + end. + + + + +-spec tokens_from_string(SrcStr) -> Result + when SrcStr :: string(), + Result :: {ok, Tokens} + | {error, sfc_err()}, + Tokens :: [sfc_token()]. + +tokens_from_string(SrcStr) -> + sfc_tokens:tokens(SrcStr). + + + +-spec ast_from_file(FilePath) -> Perhaps + when FilePath :: string(), + Perhaps :: {ok, AST} | {error, sfc_err()}, + AST :: sfc_ast(). + +ast_from_file(FilePath) -> + case file:read_file(FilePath) of + {ok, FileBytes} -> ast_from_string(FileBytes); + Error -> Error + end. + + + +-spec ast_from_string(SrcStr) -> Perhaps + when SrcStr :: string(), + Perhaps :: {ok, AST} | {error, sfc_err()}, + AST :: sfc_ast(). + +ast_from_string(SrcStr) -> + case sfc_tokens:significant_tokens(SrcStr) of + {ok, SigTks} -> ast_from_tokens(SigTks); + Error -> Error + end. + + + +-spec ast_from_tokens(SrcTokens) -> Perhaps + when SrcTokens :: [sfc_token()], + Perhaps :: {ok, AST} | {error, sfc_err()}, + AST :: sfc_ast(). + +ast_from_tokens(Tks) -> + SigTks = sfc_tokens:filter_significant(Tks), + case sfc_ast:gulp_file(SigTks) of + {gulp, AST} -> {ok, AST}; + Error -> Error + end. diff --git a/src/gsc_ast.erl b/src/gsc_ast.erl new file mode 100644 index 0000000..6fef4f6 --- /dev/null +++ b/src/gsc_ast.erl @@ -0,0 +1,693 @@ +% @doc +% type for an SFC AST +% +% based on so_syntax.erl +% +% this is the second layer in the parsing pipeline after tokens. From +% so_syntax: +% +% ## Declarations +% +% A Sophia file consists of a sequence of *declarations* in a layout block. +% +% ```c +% File ::= Block(TopDecl) +% +% TopDecl ::= ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl) +% | 'contract' 'interface' Con [Implement] '=' Block(Decl) +% | 'namespace' Con '=' Block(Decl) +% | '@compiler' PragmaOp Version +% | 'include' String +% | Using +% +% Implement ::= ':' Sep1(Con, ',') +% +% Decl ::= 'type' Id ['(' TVar* ')'] '=' TypeAlias +% | 'record' Id ['(' TVar* ')'] '=' RecordType +% | 'datatype' Id ['(' TVar* ')'] '=' DataType +% | 'let' Id [':' Type] '=' Expr +% | (EModifier* 'entrypoint' | FModifier* 'function') Block(FunDecl) +% | Using +% +% FunDecl ::= Id ':' Type // Type signature +% | Id Args [':' Type] '=' Block(Stmt) // Definition +% | Id Args [':' Type] Block(GuardedDef) // Guarded definitions +% +% GuardedDef ::= '|' Sep1(Expr, ',') '=' Block(Stmt) +% +% Using ::= 'using' Con ['as' Con] [UsingParts] +% UsingParts ::= 'for' '[' Sep1(Id, ',') ']' +% | 'hiding' '[' Sep1(Id, ',') ']' +% +% PragmaOp ::= '<' | '=<' | '==' | '>=' | '>' +% Version ::= Sep1(Int, '.') +% +% EModifier ::= 'payable' | 'stateful' +% FModifier ::= 'stateful' | 'private' +% +% Args ::= '(' Sep(Pattern, ',') ')' +% +% Type ::= Domain '=>' Type // Function type +% | Type '(' Sep(Type, ',') ')' // Type application +% | '(' Type ')' // Parens +% | 'unit' | Sep(Type, '*') // Tuples +% | Id | QId | TVar +% +% Domain ::= Type // Single argument +% | '(' Sep(Type, ',') ')' // Multiple arguments +% ``` +-module(sfc_ast). + +%-compile([export_all,nowarn_export_all]). +% +%%-export_type([ +%%]). +%% +%%-export([ +%% slurp/2 +%%]). +% +%-export([ +% gulp_file/1 +%]). +% +%-include("$sfc_include/sfc.hrl"). +% +%%%----------------------------- +%%% TYPES: sfc_ast +%%%----------------------------- +% +% +%% % placeholders +%-type ast_() :: any(). +%-record(ast_nyi, {tokens = none :: [sfc_token()]}). +%-type ast_nyi() :: #ast_nyi{}. +% +% +%% product type: foo * bar * baz +%% +%% stupid weird implication from bad syntax foresight trying to be +%% fancy and overload what parens do is products must always have at +%% least two operands; probably this is because `(foo)` is always the +%% same as `foo` +%% +%% - 0-tuple -> `unit` +%% - 1-tuple -> type itself +%% - 2+ -> here +%-record(ast_te_pi, +% {types = none :: none | [ast_te_()]}). +% +%-type ast_type_expr() +% :: #ast_te_fn{} % function (string, string) => string +% | #ast_te_ap{} % application map(string, int) +% | #ast_te_pi{} % product foo * bar * baz +% | #ast_te_tk{} % token string int 'a Foo.Bar.baz +% | ast_nyi(). +%-type ast_te() :: ast_type_expr(). +% +% +%% ta = type alias +%% 'type' Id ['(' TVar* ')'] '=' TypeAlias +%-record(ast_ta, +% {alias = none :: none | sfc_token(), +% tvars = none :: none | [sfc_token()], +% points_to = none :: none | ast_type_expr()}). +% +%% Decl ::= 'type' Id ['(' TVar* ')'] '=' TypeAlias +%% | 'record' Id ['(' TVar* ')'] '=' RecordType +%% | 'datatype' Id ['(' TVar* ')'] '=' DataType +%% | 'let' Id [':' Type] '=' Expr +%% | (EModifier* 'entrypoint' | FModifier* 'function') Block(FunDecl) +%% | Using +%-type decl() :: #ast_ta{} +% | #ast_nyi{}. +% +%% ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl) +%-record(ast_ct, +% {payable = none :: none | boolean(), +% main = none :: none | boolean(), +% name = none :: none | sfc_token(), +% impls = none :: none | [sfc_token()], +% decls = none :: none | [decl()]}). +% +%-record(ast_td_iface,{}). +%-record(ast_td_ns,{}). +%-record(ast_td_compiler,{}). +%-record(ast_td_include,{}). +%-record(ast_using,{}). +% +%% @doc +%% TopDecl ::= ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl) +%% | 'contract' 'interface' Con [Implement] '=' Block(Decl) +%% | 'namespace' Con '=' Block(Decl) +%% | '@compiler' PragmaOp Version +%% | 'include' String +%% | Using +%-type top_decl() :: #ast_ct{} +% | #ast_td_iface{} +% | #ast_td_ns{} +% | #ast_td_compiler{} +% | #ast_td_include{} +% | #ast_using{} +% . +% +%-record(ast_file, +% {top_decls :: [top_decl()]}). +%-type ast_file() :: #ast_file{}. +% +% +%-type parse_error_() :: any(). +%-record(parse_error, +% {pos = none :: none | sfc_pos(), +% msg = "" :: string(), +% subs = [] :: [parse_error_()], +% extra = none :: any()}). +%-type parse_error() :: parse_error(). +% +% +%-type ast() +% :: #ast_ct{} +% | #ast_file{} +% | #ast_nyi{} +% | #ast_ta{} +% | #ast_td_compiler{} +% | #ast_td_iface{} +% | #ast_td_include{} +% | #ast_td_ns{} +% | #ast_te_ap{} % application map(string, int) +% | #ast_te_fn{} % function (string, string) => string +% | #ast_te_pi{} % product foo * bar * baz +% | #ast_te_tk{} % token string int 'a Foo.Bar.baz +% | #ast_using{}. +% +% +% +% +% +%%%----------------------------- +%%% FUNCTIONS +%%%----------------------------- +% +%-spec gulp_file(Tokens) -> Perhaps +% when Tokens :: [sfc_token()], +% Perhaps :: {gulp, #ast_file{}} +% | {error, #parse_error{}}. +% +%gulp_file([]) -> +% {error, empty_file}; +%gulp_file(Tokens) -> +% case sfc_tokens:take_block(Tokens) of +% {Tokens, []} -> +% gulp_block(fun gulp_top_decl/1, Tokens); +% %gulp_file2([], [], Tokens); +% {A, B} -> +% StartPos = sfc_tokens:start_pos(A), +% ErrPos = sfc_tokens:start_pos(B), +% Msg = efmt("gulp_file: block starting at ~p ends at ~p instead of EOF", +% [StartPos, ErrPos]), +% {error, #parse_error{pos = ErrPos, msg = Msg}} +% end. +% +% +% +%%gulp_file2(AccOks, AccErrs, Tokens = [_ | _]) -> +%% % ItemTokens will be nonempty +%% {ItemTokens, NewTokens} = sfc_tokens:take_block_item(Tokens), +%% case gulp_top_decl(ItemTokens) of +%% {gulp, Ok} -> gulp_file2([Ok | AccOks], AccErrs, NewTokens); +%% Err -> gulp_file2(AccOks, [Err | AccErrs], NewTokens) +%% end; +%%gulp_file2(AccOks, [], []) -> +%% TopDecls = lists:reverse(AccOks), +%% {gulp, #ast_file{top_decls = TopDecls}}; +%%gulp_file2(_, AccErrs, []) -> +%% Errs = [#parse_error{pos = Pos} | _] +%% = lists:reverse(AccErrs), +%% {error, #parse_error{pos = Pos, subs = Errs}}. +% +%-spec gulp_block(GulpItem, Tokens) -> GulpedItems +% when GulpItem :: fun((ItemTokens) -> GulpedItem), +% Tokens :: [sfc_token()], +% ItemTokens :: Tokens, +% GulpedItem :: {gulp, Item} | {error, Reason}, +% GulpedItems :: {gulp, Items} | {error, Reason}, +% Items :: [Item], +% Item :: ast(), +% Reason :: any(). +%% @doc +%% this almost is the familiar `traverse' idiom from Haskell +%% +%% ghci> :t traverse +%% traverse +%% :: (Traversable t, Applicative f) => (a -> f b) -> t a -> f (t b) +%% ghci> :t mapM +%% mapM :: (Traversable t, Monad m) => (a -> m b) -> t a -> m (t b) +%% +%% (or `mapM' if youre being sloppy) +%% +%% however in our case our function is actually of +%% signature +%% +%% :: (t a -> f b) +%% +%% so this is more sophisticated +%% @end +% +%gulp_block(GulpItem, Tokens) -> +% gulp_block(GulpItem, [], [], Tokens). +% +%gulp_block(GulpItem, AccOks, AccErrs, Tokens = [_ | _]) -> +% % ItemTokens will be nonempty +% {ItemTokens, NewTokens} = sfc_tokens:take_block_item(Tokens), +% case GulpItem(ItemTokens) of +% {gulp, Ok} -> gulp_block(GulpItem, [Ok | AccOks], AccErrs, NewTokens); +% Err -> gulp_block(GulpItem, AccOks, [Err | AccErrs], NewTokens) +% end; +%gulp_block(_, AccOks, [], []) -> +% Items = lists:reverse(AccOks), +% {gulp, Items}; +%gulp_block(_, _, AccErrs, []) -> +% {error, #parse_error{subs = lists:reverse(AccErrs)}}. +% +% +% +%-spec gulp_top_decl(DeclTokens) -> Result +% when DeclTokens :: [sfc_token()], +% Result :: {gulp, ast()} | {error, any()}. +% +%% @doc +%% TopDecl ::= ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl) +%% | 'contract' 'interface' Con [Implement] '=' Block(Decl) +%% | 'namespace' Con '=' Block(Decl) +%% | '@compiler' PragmaOp Version +%% | 'include' String +%% | Using +%% @end +%gulp_top_decl(DeclTokens) -> +% case sfc_tokens:strings(3, DeclTokens) of +% ["payable", "contract", "interface"] -> +% gulp_nyi(DeclTokens); +% ["contract", "interface" | _] -> +% gulp_nyi(DeclTokens); +% ["payable", "main", "contract"] -> +% gulp_ct2(#ast_ct{payable = true, main = true}, +% drop(3, DeclTokens)); +% ["main", "contract" | _] -> +% gulp_ct2(#ast_ct{payable = false, main = true}, +% drop(2, DeclTokens)); +% ["payable", "contract" | _] -> +% gulp_ct2(#ast_ct{payable = true, main = false}, +% drop(2, DeclTokens)); +% ["contract" | _] -> +% gulp_ct2(#ast_ct{payable = false, main = false}, +% drop(1, DeclTokens)); +% ["namespace" | _] -> +% gulp_nyi(DeclTokens); +% ["@", "compiler" | _] -> +% gulp_nyi(DeclTokens); +% ["include" | _] -> +% gulp_nyi(DeclTokens); +% ["using" | _ ] -> +% gulp_nyi(DeclTokens); +% _ -> +% % decl tokens will always be nonempty +% [H = #sfc_token{pos = ErrPos} | _] = DeclTokens, +% EMsg = efmt("gulp_top_decl: bad token: ~p; " +% "expecting one of: [payable, main, contract, namespace, @compiler, include, using]", +% [H]), +% {error, #parse_error{pos = ErrPos, +% msg = EMsg}} +% end. +% +% +%gulp_ct2(Ast, Tokens = [#sfc_token{pos = Pos} | _]) -> +% % need to pass through pos for error messages on premature end of +% % input... for now it's a fixme thing +% gulp_ct3(Ast, Tokens, Pos). +% +% +%% payable main contract X : Y, Z, W = +%% ^ +%gulp_ct3(Ast = #ast_ct{name = none}, Tokens, Pos) -> +% NewPos = Pos, %% fixme +% case Tokens of +% [Name = #sfc_token{type = con} | NewTokens] -> +% NewAst = Ast#ast_ct{name = Name}, +% gulp_ct3(NewAst, NewTokens, NewPos); +% [Name | _] -> +% {error, {fixme, {bad_contract_name, Name}}}; +% [] -> +% {error, {fixme, no_contract_name}} +% end; +%% contract X = +%% contract X : Y, Z, W = +%% ^ +%gulp_ct3(Ast = #ast_ct{impls = none}, Tokens, Pos) -> +% NewPos = Pos, %% fixme +% case slurp_ct_impls(Tokens) of +% {slurp, Impls, NewTokens} -> +% NewAst = Ast#ast_ct{impls = Impls}, +% gulp_ct3(NewAst, NewTokens, NewPos); +% Error -> +% Error +% end; +%% contract X = +%% contract X : Y, Z, W = +%% ^ +%gulp_ct3(Ast = #ast_ct{decls = none}, +% [#sfc_token{string = "="} | Tokens], +% _Pos) -> +% case gulp_block(fun gulp_decl/1, Tokens) of +% {gulp, Decls} -> +% Result = Ast#ast_ct{decls = Decls}, +% {gulp, Result}; +% Error -> +% Error +% end; +%gulp_ct3(Ast, Tokens, Pos) -> +% {error, {fixme, {nyi, gulp_ct3, [Ast, Tokens, Pos]}}}. +% +% +% +%-spec slurp_ct_impls(Tokens) -> Slurped +% when Tokens :: [sfc_token()], +% Slurped :: {slurp, Impls, NewTokens} +% | {error, Reason}, +% Impls :: Tokens, +% NewTokens :: Tokens, +% Reason :: any(). +%% @doc ": X, Y, Z = ..." ~> {slurp, [X, Y, Z], "= ..."} +% +%% FIXME: this should be a restructured a tiny bit for better error +%% handling +%slurp_ct_impls([_ = #sfc_token{string = ":"}, +% Con = #sfc_token{type = con} +% | NewTokens]) -> +% slurp_ct_impls2([Con], NewTokens); +%slurp_ct_impls(NewTokens = [#sfc_token{string = "="} | _]) -> +% {slurp, [], NewTokens}; +%slurp_ct_impls([BadToken | _]) -> +% Msg = efmt("slurp_ct_impls: expecting (: Con | =), got ~p", [BadToken]), +% {error, {fixme, Msg}}; +%slurp_ct_impls([]) -> +% {error, {fixme, "expecting : or =, got end-of-input"}}. +% +% +%slurp_ct_impls2(Stk, [_ = #sfc_token{string = ","}, +% Con = #sfc_token{type = con} +% | NewTokens]) -> +% slurp_ct_impls2([Con | Stk], NewTokens); +%slurp_ct_impls2(Stk, NewTokens = [#sfc_token{string = "="} | _]) -> +% {slurp, lists:reverse(Stk), NewTokens}; +%slurp_ct_impls2(Stk, BadTokens) -> +% {error, {fixme, nyi, slurp_ct_impls2, [Stk, BadTokens]}}. +% +% +%% Decl ::= 'type' Id ['(' TVar* ')'] '=' TypeAlias +%% | 'record' Id ['(' TVar* ')'] '=' RecordType +%% | 'datatype' Id ['(' TVar* ')'] '=' DataType +%% | 'let' Id [':' Type] '=' Expr +%% | (EModifier* 'entrypoint' | FModifier* 'function') Block(FunDecl) +%% | Using +%gulp_decl(Tokens) -> +% case sfc_tokens:strings(1, Tokens) of +% ["type"] -> gulp_type_alias(Tokens); +% _ -> gulp_nyi(Tokens) +% end. +% +%% 'type' Id ['(' TVar* ')'] '=' TypeAlias +%gulp_type_alias(Tokens) -> +% gulp_ta(#ast_ta{}, Tokens). +% +% +%% 'type' Id ['(' TVar* ')'] '=' TypeAlias +%% ^ +%gulp_ta(Ast = #ast_ta{alias = none}, +% _ = [#sfc_token{string = "type"}, Alias = #sfc_token{type = id} +% | NewTokens]) -> +% NewAst = Ast#ast_ta{alias = Alias}, +% gulp_ta(NewAst, NewTokens); +%% 'type' Id ['(' TVar* ')'] '=' TypeAlias +%% ^ +%gulp_ta(Ast = #ast_ta{tvars = none}, +% Tokens) -> +% case slurp_ta_tvars(Tokens) of +% {slurp, TVars, NewTokens} -> +% NewAst = Ast#ast_ta{tvars = TVars}, +% gulp_ta(NewAst, NewTokens); +% Error -> Error +% end; +%% 'type' Id ['(' TVar* ')'] '=' TypeAlias +%% ^ +%gulp_ta(Ast = #ast_ta{points_to = none}, +% _ = [#sfc_token{string = "="} | NewTokens]) -> +% case gulp_type_expr(NewTokens) of +% {gulp, TypeExpr} -> +% Result = Ast#ast_ta{points_to = TypeExpr}, +% {gulp, Result}; +% Error -> Error +% end; +%gulp_ta(A, B) -> +% {error, {fixme, gulp_ta, A, B, ?MODULE, ?LINE}}. +% +% +%% type foo(bar, baz) = quux +%% ^ +%% +%% "(bar, baz) = ..." ~> {slurp, [bar, baz], "= ..."} +%slurp_ta_tvars(Tks = [#sfc_token{string = "="} | _]) -> +% {slurp, [], Tks}; +%slurp_ta_tvars([#sfc_token{string = "("}, +% #sfc_token{string = ")"} +% | NewTokens]) -> +% {slurp, [], NewTokens}; +%slurp_ta_tvars([_ = #sfc_token{string = "("}, +% TVar = #sfc_token{type = tvar} +% | NewTokens]) -> +% slurp_tavars([TVar], NewTokens). +% +% +%slurp_tavars(Stk, +% [_ = #sfc_token{string = ","}, +% TVar = #sfc_token{type = tvar} +% | NewTks]) -> +% slurp_tavars([TVar | Stk], NewTks); +%slurp_tavars(Stk, +% [#sfc_token{string = ")"} +% | NewTks]) -> +% {slurp, lists:reverse(Stk), NewTks}. +% +% +%gulp_type_expr(Tokens) -> +% case slurp_type_expr(Tokens) of +% {slurp, T, []} -> {gulp, T}; +% {slurp, _, E} -> {error, {fixme, trailing, E}}; +% Error -> Error +% end. +% +% +%% Type ::= Domain '=>' Type // Function type +%% | Type '(' Sep(Type, ',') ')' // Type application +%% | '(' Type ')' // Parens +%% | 'unit' +%% | Sep(Type, '*') // Tuples +%% | Id +%% | QId +%% | TVar +%% +%% Domain ::= Type // Single argument +%% | '(' Sep(Type, ',') ')' // Multiple arguments +% +%% terminals: +%% TkType = id | qid | tvar +%% PList = PList0 | PList1 | PList2 +%% ProdType = Type [* Type]+ +%% LamType = Type [=> Type]+ +%% +%% PList0 = () +%% PList1 = (Type) +%% PList2 = (Type [, Type]+) +%% +%% Ah ok: +%% +%% Type1 = TkType | PList(Type) +%% +%% Type = Type1 % string, (string), (string, string) +%% | Type1 => Type % function +%% | Type1 PList(Type) % application +%% | Type1 * Type % product +% +% +%% each type either starts with an id/qid/tvar token or an open paren +%% +%% we're going to eliminate open paren cases first: +%% +%% legal cases: +%% +%% 1. Function types can have arbitrary #args: +%% +%% () => ... // legal +%% (_) => ... // legal +%% (_, _) => ... // legal +%% +%% 2. Otherwise it must be a plist of ONLY one type +%% +%% () // illegal, should be "unit" +%% (X) // legal, equiv to X +%% (X, Y) // illegal, should be X * Y +%% +%slurp_type_expr(Tks = [#sfc_token{pos = Pos} | _]) -> +% case slurp_type1(Tks) of +% % plist cases +% % function types _ => _ +% {slurp, {plist, PList}, NewTks} -> +% case {PList, NewTks} of +% % ANY plist (including (), (foo)) followed by => is fine +% % () => _ +% % (_) => _ +% % (_, _) => _ +% {DomainType, [#sfc_token{string = "=>"} | After]} -> +% case slurp_type_expr(After) of +% {slurp, CodomainType, NewTks} -> +% TypeExpr = #ast_te_fn{dom = DomainType, +% codom = CodomainType}, +% {slurp, TypeExpr, NewTks}; +% Error = {error, _} -> +% Error +% end; +% % WMA not a function type, so only plists with 1 type are +% % allowed +% {[One], After} -> +% slurp_type_expr2(One, After); +% % plists of 0 or 2+ not allowed +% {[], _} -> +% Msg = "() is called unit", +% {error, {fixme, ?MODULE, ?LINE, illegal_type, Pos, Msg}}; +% {_, _} -> +% Msg = "(_, _, ...) must be followed by =>; " +% "the type for tuples (A, B) is A * B", +% {error, {fixme, ?MODULE, ?LINE, illegal_type, Pos, Msg}} +% end; +% % first token is id|qid|tvar: foo, Foo.bar, 'foo +% {slurp, {token, Tk}, NewTks} -> +% case NewTks of +% % foo => bar is fine, normalized to (foo) => bar +% [#sfc_token{string = "=>"} | After] -> +% case slurp_type_expr(After) of +% {slurp, Codom, NewAfter} -> +% ArgType = #ast_te_tk{token = ArgTypeTk}, +% Dom = [ArgType], +% Result = #ast_te_fn{dom = Dom, codom = Codom}, +% {slurp, Result, NewAfter}; +% Error = {error, _} -> +% Error +% end; +% {_, _} -> +% One = #ast_te_tk{token = Tk}, +% slurp_type_expr2(One, After) +% end; +% Error = {error, _} -> +% Error +% end. +% +% +%% parsed the first type, now need to see if this is an application +%% or a product or if we're done +%slurp_type_expr2(FirstType, After) -> +% % fixme: just going to ignore products and applications for now +% {slurp, FirstType, After}. +% %case strings(1, After) of +% % ["("] -> error(nyi); +% % ["*"] -> error(nyi); +% %case After of +% % [#sfc_token{string = "("} | _] -> +% % case slurp_type1_II(After) of +% % {slurp, {plist, ArgTypes}, NewAfter} -> +% % {slurp, #ast_te_ap{fn = FirstType +% +% +% +% +%% Type1 = {plist, Types} () (foo) (foo, bar) +%% | {token, #sfc_token{}} foo Bar.baz 'quux +%slurp_type1(Tks) -> +% case sfc_tokens:slurp_plist(Tks) of +% % head token is NOT open paren -> must be id/qid/tvar +% {slurp, [], [Tk | NewTks]} -> +% TkType = Tk#sfc_token.type, +% case TkType of +% id -> {slurp, {token, Tk}, NewTks}; +% qid -> {slurp, {token, Tk}, NewTks}; +% tvar -> {slurp, {token, Tk}, NewTks}; +% _ -> {error, {fixme, illegal_type, Tk, ?MODULE, ?LINE}} +% end; +% % head token is an open paren, need to recursively consume +% % the inner types +% {slurp, PListTokens, After} -> +% case gulp_ptype1(PListTokens) of +% {gulp, Types} -> {slurp, {plist, Types}, After}; +% Error = {error, _} -> Error +% end; +% Error = {error, _} -> Error +% end. +% +% +%%slurp_type_expr_plist(Tks) -> +%% case sfc_tokens:slurp_plist(Tks) of +%% % head token is NOT open paren -> must be id/qid/tvar +%% {slurp, [], [Tk | NewTks]} -> +%% TkType = Tk#sfc_token.type, +%% case TkType of +%% id -> {slurp, {token, Tk}, NewTks}; +%% qid -> {slurp, {token, Tk}, NewTks}; +%% tvar -> {slurp, {token, Tk}, NewTks}; +%% _ -> {error, {fixme, illegal_type, Tk}} +%% end; +%% % head token is an open paren, need to recursively consume +%% % the inner types +%% {slurp, PListTokens, After} -> +%% case gulp_ptype1(PListTokens) of +%% {gulp, Types} -> {slurp, {plist, Types}, After}; +%% Error = {error, _} -> Error +%% end; +%% Error = {error, _} -> Error +%% end. +% +%gulp_ptype1([#sfc_token{string = "("}, #sfc_token{string = ")"}]) -> +% {gulp, []}; +%gulp_ptype1([#sfc_token{string = "("} | Tail]) -> +% gulp_ptype1_II([], Tail). +% +%gulp_ptype1_II(Stk, Tks) -> +% case slurp_type_expr(Tks) of +% {slurp, NewType, [#sfc_token{string = ")"}]} -> +% {gulp, lists:reverse([NewType | Stk])}; +% {slurp, NewType, [#sfc_token{string = ","} | NewTks]} -> +% gulp_ptype1_II([NewType | Stk], NewTks); +% Error = {error, _} -> +% Error +% end. +% +% +%%gulp_te_tk([Tk = #sfc_token{type = TkType}]) +%% when id =:= TkType; +%% qid =:= TkType; +%% tvar =:= TkType -> +%% {gulp, #ast_te_tk{token = Tk}}; +%%gulp_te_tk(Tks) -> +%% {error, {fixme, nyi, gulp_te_tks, Tks}}. +% +% +%gulp_nyi(Tokens) -> +% {gulp, #ast_nyi{tokens = Tokens}}. +% +% +%drop(N, [_ | Xs]) when is_integer(N), N >= 1 -> +% drop(N-1, Xs); +%drop(_, []) -> +% []; +%drop(0, X) -> +% X. +% +%efmt(Fmt, Args) -> +% unicode:characters_to_list(io_lib:format(Fmt, Args)). diff --git a/src/gsc_bst.erl b/src/gsc_bst.erl new file mode 100644 index 0000000..5e0d7fa --- /dev/null +++ b/src/gsc_bst.erl @@ -0,0 +1,85 @@ +% @doc +% sfc_bst = ast second attempt but prefix so tab complete +% +% from docs/sophia/so_syntax.md: +% +% File ::= Block(TopDecl) +% +% TopDecl ::= ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl) +% | 'contract' 'interface' Con [Implement] '=' Block(Decl) +% | 'namespace' Con '=' Block(Decl) +% | '@compiler' PragmaOp Version +% | 'include' String +% | Using +% +% Implement ::= ':' Sep1(Con, ',') +% +% Decl ::= 'type' Id ['(' TVar* ')'] '=' TypeAlias +% | 'record' Id ['(' TVar* ')'] '=' RecordType +% | 'datatype' Id ['(' TVar* ')'] '=' DataType +% | 'let' Id [':' Type] '=' Expr +% | (EModifier* 'entrypoint' | FModifier* 'function') Block(FunDecl) +% | Using +% +% FunDecl ::= Id ':' Type // Type signature +% | Id Args [':' Type] '=' Block(Stmt) // Definition +% | Id Args [':' Type] Block(GuardedDef) // Guarded definitions +% +% GuardedDef ::= '|' Sep1(Expr, ',') '=' Block(Stmt) +% +% Using ::= 'using' Con ['as' Con] [UsingParts] +% UsingParts ::= 'for' '[' Sep1(Id, ',') ']' +% | 'hiding' '[' Sep1(Id, ',') ']' +% +% PragmaOp ::= '<' | '=<' | '==' | '>=' | '>' +% Version ::= Sep1(Int, '.') +% +% EModifier ::= 'payable' | 'stateful' +% FModifier ::= 'stateful' | 'private' +% +% Args ::= '(' Sep(Pattern, ',') ')' +-module(sfc_bst). + +-compile([export_all, nowarn_export_all]). +-include("$sfc_include/sfc.hrl"). + +%-record(bst_nyi, {tokens :: [sfc_token()]). +% +%% ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl) +%-record(bst_ct, +% {payable = none :: none | boolean(), +% main = none :: none | boolean(), +% name = none :: none | string(), +% impl = none :: none | [string()], +% eq = none :: none | '=', +% decls = none :: none | [decl()]}). +% +%% 'contract' 'interface' Con [Implement] '=' Block(Decl) +%-record(bst_iface, +% {payable = none :: none | boolean() +% name = none :: none | string(), +% impl = none :: none | [string()], +% eq = none :: none | '=', +% decls = none :: none | [decl()]}). +% +% +% +%% File ::= Block(TopDecl) +%gulp(file, Tokens) -> +% gulp({block, top_decl}, Tokens); +%% TopDecl ::= ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl) +%% | 'contract' 'interface' Con [Implement] '=' Block(Decl) +%% | 'namespace' Con '=' Block(Decl) +%% | '@compiler' PragmaOp Version +%% | 'include' String +%% | Using +%gulp(top_decl, [#sfc_token{string = S} | Rest]) -> +% case strings(3, Tokens) of +% ["payable", "contract", "interface"] -> +% gulp_ct(#bst_iface{payable = true, main = true}, drop(3, Tokens)); +% ["payable", "contract", _] -> +% gulp_ct(#bst_ct{payable = true, main = false}, drop(2, Tokens)); +% ["contract" | _] -> +% gulp({oneof, [bst_iface, bst_ct, bst_nyi]}, Tokens); +%gulp(bst_ct, Tokens) -> +%gulp(top_decl, Tokens) -> diff --git a/src/gsc_parse_type_expr.erl b/src/gsc_parse_type_expr.erl new file mode 100644 index 0000000..a2faca3 --- /dev/null +++ b/src/gsc_parse_type_expr.erl @@ -0,0 +1,216 @@ +-module(sfc_parse_type_expr). + +-export_type([ +]). + +-export([ + unsafe_vtks_from_string/1, + gulp_vtks/1, + take_until_ifx_op/1 +]). + +-include("$sfc_include/sfc.hrl"). + + +%------------------------------------------------------ +% TYPES +%------------------------------------------------------ + +-type vtk_ifx_op() :: vtk_apply_to + | {'vtk_*', sfc_token()} + | {'vtk_=>', sfc_token()}. + +-type vtk() :: sfc_token() + | {vtk_plist, [sfc_token()]} + | vtk_ifx_op(). + + +-type gulped(X) :: {gulp, X} + | {error, any()}. + +-type slurped(X) :: {slurp, X, Rest :: [sfc_token()]} + | {error, any()}. + + +%------------------------------------------------------ +% FUNCTIONS +%------------------------------------------------------ + + +-spec unsafe_vtks_from_string(SrcStr) -> Vtks when + SrcStr :: string(), + Vtks :: [vtk()]. + +% @doc for testing +unsafe_vtks_from_string(S) -> + {ok, SigTks} = sfc_tokens:significant_tokens(S), + {gulp, Vtks} = gulp_vtks(SigTks), + Vtks. + + +% operators in descending order of exteriority +% +% precedence verbiage confuses me +% +% 1 + 2 * 3 ^ 4 +% +% (+ 1 (* 2 (^ 3 4))) +% +% precedence is thinking about the operators as like +% having arms and legs and doing something. how much +% power do they have to bind to their neighbors. it's +% thinking of your operators as verbs in some state +% machine. +% +% i don't like when i have to model the state machine +% in my head when i'm reading code. everything should +% just be there. exteriority and interiority are +% properties of nouns, not properties of verbs. +% +% functional programming is all about offloading as +% much of your reasoning into nouns as possible. verbs +% change things. and change is bad. +-type chunk_strategy() + :: ast_parens % prefix paren (x y z) + | '=>' % infixr + | '*' % infix + | 'apply' % postfix paren f (...) + . + + +-record(ast_parens, + {open = none :: none | sfc_token(), + inner = none :: none | [sfc_token()], + close = none :: none | sfc_token()}). + +chunk_by(Strategy, Tokens) -> + chunk_by(Strategy, [], Tokens). + + +-spec chunk0(Strategy, Oks, Errs, Tokens) -> Result when + Strategy :: chunk_strategy(), + Oks :: [any()], + Errs :: [{error, Reason :: any()}], + Tokens :: [sfc_token()], + Result :: {ok, + +gulp_chunks_by(_, Stk, [], []) -> + {gulp, lists:reverse(Stk)}; +gulp_chunks_by(_, _, Errs, []) -> + {error, {fixme, {?MODULE, ?LINE}, Errs}}; +gulp_chunks_by(plist, Stk, Errs, Tokens) -> + case slurp_plist_rec(Tokens) of + {slurp, Plist, NewTokens} -> + gulp_chunks_by(plist, [PList | Stk], Errs, NewTokens); + barf -> + [Token | NewTokens] = Tokens, + gulp_chunks_by(plist, [Token | Stk], Errs, NewTokens); + Error -> + gulp_chunks_by(plist, Stk, [Error | Errs], Tokens); + end. + +slurp_plist_rec(Tokens = [#sfc_token{string = "(" | _]) -> + case sfc_tokens:slurp_plist(Tokens) of + {slurp, [], _} -> + barf; + {slurp, PTokens, NewTokens} -> + PTokensInner = pt_inner(PTokens), + end; + +%-spec gulp_ifx_tree(Tokens) -> gulped(IfxTree) when +% Tokens :: [sfc_token()], +% IfxTree :: ifx_tree(). +% +%-spec chunk_by(ChunkStrategy, Tokens) -> Result when +% ChunkStrategy :: chunk_strategy(), +% Tokens :: [sfc_token()], +% Result :: {ChunkStrategy, + + + +-spec gulp_vtks(Tokens) -> Result when + Tokens :: [sfc_token()], + Result :: gulped(VirtualTokens), + VirtualTokens :: [vtk()]. + +% @private +% for infix precedence/associativity resolution. +% +% better/original name (although misnomer) was +% insert_virtual_tokens. we're using the gulp verbiage +% because we're guarding against mistmatched delimiters +% @end +gulp_vtks(Tokens) -> + gulp_vtks([], Tokens). + +gulp_vtks(Acc, Tks0) -> + % scan until next "(" | "*" | "=>" + case take_until_ifx_op(Tks0) of + % no infix op remaining, return arg + {_Pfx = Tks0, + _Sfx = []} -> + {gulp, lists:flatten([Acc, Tks0])}; + % application + % "... foo(bar, baz) ..." + % ~> [..., foo, {plist, "(bar, baz)"}, ...] + {_Pfx = Tks1_BeforeOpen, + _Sfx = Tks2_OpenNAfter + = [#sfc_token{string = "("} | _]} -> + case sfc_tokens:slurp_plist(Tks2_OpenNAfter) of + {slurp, Tks2A_OpenToClose, Tks2B_AfterClose} -> + NewAcc = [Acc, + Tks1_BeforeOpen, + vtk_apply_to, + {vtk_plist, Tks2A_OpenToClose}], + gulp_vtks(NewAcc, Tks2B_AfterClose); + Error = {error, _} -> + Error + end; + % product + {_Pfx = Tks0_BeforeTimes, + _Sfx = [ Tk1A_Times = #sfc_token{string = "*"} + | Tks1B_AfterTimes]} -> + NewAcc = [Acc, + Tks0_BeforeTimes, + {'vtk_*', Tk1A_Times}], + gulp_vtks(NewAcc, Tks1B_AfterTimes); + % funType + {_Pfx = Tks0_BeforeOp, + _Sfx = [ Tk1A_Op = #sfc_token{string = "=>"} + | Tks1B_AfterOp]} -> + NewAcc = [Acc, + Tks0_BeforeOp, + {'vtk_=>', Tk1A_Op}], + gulp_vtks(NewAcc, Tks1B_AfterOp) + end. + + + +-spec take_until_ifx_op(Tokens) -> Result when + Tokens :: [sfc_token()], + Result :: {Taken, NewTokens}, + Taken :: Tokens, + NewTokens :: Tokens. +% @doc +% consume tokens until one of ["(", "*", "=>"] + +take_until_ifx_op(Tks) -> + take_until_ifx_op([], Tks). + +take_until_ifx_op(Stack, []) -> + {lists:reverse(Stack), []}; +take_until_ifx_op(Stack, Tokens = [Token | NewTokens]) -> + TokStr = Token#sfc_token.string, + Continue = + case TokStr of + % exit cases + "(" -> false; + "*" -> false; + "=>" -> false; + _ -> true + end, + case Continue of + true -> take_until_ifx_op([Token | Stack], NewTokens); + false -> {lists:reverse(Stack), Tokens} + end. + diff --git a/src/gsc_so_scan.erl b/src/gsc_so_scan.erl new file mode 100644 index 0000000..7ea6475 --- /dev/null +++ b/src/gsc_so_scan.erl @@ -0,0 +1,457 @@ +% @doc compatibility layer to test against so_scan +% +% converts sfc_tokens data to so_scan tokens +% +% Ref: so_scan.erl +-module(sfc_so_scan). + +-export_type([ + so_kwd/0, + so_special_char/0, + so_symbol/0, + so_token2/0, + so_token3/0, + so_token/0 +]). + +-export([ + scan/1, + ken_barson_rises/2 +]). + +-include("$sfc_include/sfc.hrl"). + +%================================ +% API: types +%================================ + +% FIXME: single-quote all the atoms to future-proof against the elixir retards +% adding more keywords to erlang +-type so_kwd() :: contract + | include + | 'let' + | switch + | type + | record + | datatype + | 'if' + | elif + | 'else' + | function + | stateful + | payable + | 'true' + | 'false' + | mod + | public + | entrypoint + | private + | indexed + | namespace + | interface + | main + | using + | as + | for + | hiding + | 'band' + | 'bor' + | 'bxor' + | 'bnot'. + +-type so_special_char() :: '..' + | ',' + | '.' + | ';' + | '(' + | ')' + | '[' + | ']' + | '{' + | '}' + . + +% @doc bad type... essentially a string that is the outcome of a regex match is +% cast to an atom, and that's the type that goes here +-type so_symbol() :: so_kwd() | so_special_char() | atom(). + +-type so_token2() :: {Symbol :: so_symbol(), + Location :: sfc_pos()}. + +% FIXME +% this is 'id', 'con', qid +-type so_tk3type() :: char | string | hex | int | bytes | qid | qcon | tvar | id | con. + +-type so_token3() :: {TokenType :: so_tk3type(), + Location :: sfc_pos(), + TokenValue :: term()}. + +-type so_token() :: so_token2() | so_token3(). + + +%================================ +% API: functions +%================================ + +-spec scan(SrcStr) -> {ok, SoTokens} | {error, sfc_err()} + when SrcStr :: iolist(), + SoTokens :: [so_token()]. +% @doc +% this is meant to agree with so_scan:scan/1 in all cases +% +% this converts sfc's internal representation of tokens into the format that +% so_scan outputs +% @end + +scan(SrcStr) -> + case sfc_tokens:tokens(SrcStr) of + {ok, SfLTokens} -> + SoTokens = to_so_tokens(SfLTokens), + {ok, SoTokens}; + % fucking stupid + {error, #sfc_err_bcom_unterminated{prev_tokens = SfcTokens}} -> + {ok, to_so_tokens(SfcTokens)}; + Error -> + Error + end. + + + +-spec to_so_tokens(SfcTokens) -> SoTokens + when SfcTokens :: [sfc_token()], + SoTokens :: [so_token()]. + +% @doc +% most sfc tokens map 1-to-1 with so_tokens. the +% exception is ak/ct/sg literals. this is a +% many-to-one-mapping, and therefore ak, sg, ct need to +% be handled at the list level. +% +% the reason is as follows: +% +% so_scan lexes ak_ABCD to an id, then at the parsing +% stage computes the pubkey that corresponds to. +% +% as a result, if we have ak_GHI, I is not a valid +% base58 char, so WE (sfc) end up lexing that as +% +% [{ak, "ak_GH"}, {con, "I"}] +% +% and so_scan lexes that as {id, "ak_GHI"}]. +% +% however we also don't ignore whitespace, so we can +% tell if this happens because it occurs precisely when +% an ak/sg/ct is immediately followed by a +% non-whitespace token. however there could be more +% than 1 and they can be a variety of different shapes. +% so we have to greedily consume them back into a +% single id. +% +% bugs in the happy path are trans-features +% +% so if we see an ak/ct/sg token, we summon evil ben +% carson to reconjoin the unconjoined twins +to_so_tokens([ AkTok = #sfc_token{type = AkCtSg, pos = Pos} + | Sheeit]) + when ak =:= AkCtSg; + ct =:= AkCtSg; + sg =:= AkCtSg -> + {#sfc_token{string = FinalAkStr}, NewSheeit} + = ken_barson_rises(AkTok, Sheeit), + [{id, Pos, FinalAkStr}| to_so_tokens(NewSheeit)]; +% this part is just lists:filtermap +to_so_tokens([X | Xs]) -> + case to_so_token(X) of + false -> to_so_tokens(Xs); + {true, SoToken} -> [SoToken | to_so_tokens(Xs)] + end; +to_so_tokens([]) -> + []. + + + +-spec ken_barson_rises(InitApiToken, SfToks) -> {FinalApiToken, NewSfToks} + when InitApiToken :: sfc_token(), + SfToks :: [sfc_token()], + FinalApiToken :: InitApiToken, + NewSfToks :: SfToks. +% @doc +% +% .-""""""""""""-. +% .-' .-======-. '-. +% .' / .----. \ '. +% / | / \ | \ +% | | | @ @ | | | +% | | | __ | | | +% | | | /@@\ | | | +% \ | | \__/ | | / +% '. | \_++++_/ | .' +% '-._| |\/\/| |_.-' +% | |/\/\| | +% | \____/ | +% ___| BEN CARSON |___ +% .-' | HAS BECOME | '-. +% / | TOO POWERFUL| \ +% / |______________| \ +% / .-'''-. .-'''-. \ +% | / .-. \ / .-. \ | +% | | ( ) | | ( ) | | +% | \ '-' / \ '-' / | +% \ '-...-' /\ '-...-' / +% '._ / \ _.' +% '-._____.-' '-._____.-' +% +% THE SOFT-SPOKEN DOOM DOCTOR +% “I prescribed… CHAOS.” +% +% BUGS IN THE HAPPY PATH ARE features. +% BUGS IN THE HAPPY PATH ARE features. +% +% WE LIKE features. +% +% features MAKE US MONEY. +% +% features ARE NOT FOOD. +% features ARE friends. +% @end + + + +% This function takes the unconjoined twins (e.g. +% `ak_GHI` lexed to `ak_GH` followed by `I`) and +% recursively reconjoins them so they can all live +% happily together as a single so_scan token which will +% fail in the parsing step. +% +% on account of the property that the concatenation of +% all the token strings equals the original source file +% (FIXME: should test this in test suite) +% +% basically this looks at the next token, and if it's a +% type that so_scan is going to consume as part of an +% `id` token, then we add it to the stack. +% +% quoth claude: +% so_scan lexes identifiers with +% /[a-z_][a-zA-Z0-9_']*/. The base58 alphabet used by +% `smr_apistr58` is: +% +% 123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz +% +% Characters **valid in a so_scan id tail** but +% **absent from base58**: +% +% | char | why excluded from base58 | +% |------|---------------------------------------| +% | `0` | looks like `O` | +% | `I` | looks like `1` or `l` | +% | `O` | looks like `0` | +% | `l` | looks like `1` or `I` | +% | `_` | not alphanumeric (structural, not b58)| +% | `'` | not alphanumeric (sophia id quirk) | +% +% When any of these appear AFTER at least one valid +% base58 char in a `ak_`/`ct_`/`sg_` prefixed +% identifier, `sfc` splits what `so_scan` sees as one +% `id` token into 2+ sfc tokens. +% +% **No split if non-base58 char is immediately after +% `_`**: `smr_plus` requires >=1 base58 char to +% match; `ak_I`, `ak_0`, `ak__bar` all fall +% through to `id` and both tokenizers agree. +ken_barson_rises(AkTokAcc = #sfc_token{string = AkStr}, + SrcTokens = [#sfc_token{type = CandidateType, + string = CandidateString} + | Rest]) -> + % candidate: + % dig out the token type and the string + Smash = lists:member(CandidateType, smash_types()), + Pass = lists:member(CandidateType, pass_types()), + % sanity check + true = Smash or Pass, + if + Smash -> + % dig out the token from LcTokApi + NewAkStr = AkStr ++ CandidateString, + NewAkTokAcc = AkTokAcc#sfc_token{string = NewAkStr}, + ken_barson_rises(NewAkTokAcc, Rest); + Pass -> + {AkTokAcc, SrcTokens} + end; +ken_barson_rises(Done, []) -> + {Done, []}. + +smash_types() -> + [char, % ak_GH'a' -> {char, "'a'"} + int16, % ak_GH0xAB -> {int16, "0xAB"} + int10, % ak_GH0123 -> {int10, "0123"} + tvar, % ak_GH'a -> {tvar, "'a"} + kwd, % ak_GHlet -> {kwd, "let"} + id, % ak_GH_AB -> {id, "_AB"} + con]. % ak_GHI -> {con, "I"} + +pass_types() -> + % why each of these are impossible + % meaning the prefix for each of these will cause + % so_scan to break out of consuming an id, or will + % never be a disjoined + % neighbor + [lcom, % ak_AB// breaks out of id + bcom, % ak_AB/* breaks out of id + ws, % ak_AB\t breaks out of id + punct, % ak_AB{ breaks out of id + string, % ak_AB" breaks out of id + bytes, % ak_AB# breaks out of id + ak,ct,sg, % ak_ABak [akctsg] all in base58 alphabet + qid, % ak_ABI.Am.A.qid ??? maybe sophia lexes this to [{id, _}, '.']? + qcon, % ak_ABI.Am.A.QCon ??? same + op]. % ak_AB=< [=!<>+-*/:&|?~@^] break out of id + + + +-spec to_so_token(SfcToken) -> MaybeSoToken + when SfcToken :: sfc_token(), + MaybeSoToken :: {true, SoToken} + | false, + SoToken :: so_token(). + +% @private +% does NOT handle ak/ct/sg because these may consume +% follow-on tokens +% @end + +to_so_token(#sfc_token{type = SfTokenType, + pos = Pos, + string = SfTokenStr}) -> + case SfTokenType of + %----------------- + % Ignored + %----------------- + bcom -> false; + lcom -> false; + ws -> false; + %----------------------- + % {_, _} + % + % {contract, {420, 69}} + %----------------------- + % kwds ops and punct are all collapsed by + % so_scan:scan down to eg {'contract', {420, 69}} + % where {420, 69} is the source location + % these are three different parsers + Sym when Sym =:= kwd; + Sym =:= op; + Sym =:= punct -> + Symbol = list_to_atom(SfTokenStr), + {true, {Symbol, Pos}}; + %------------------------------------ + % {_, _, _} + % + % {id, {420, 69}, "foo"} + %-------------------------------- + QVar when QVar =:= qid; QVar =:= qcon -> + % qualifieds tokenize to + % {qid, {420, 69}, ["Foo", "Bar", "baz"]} + {true, {QVar, Pos, string:tokens(SfTokenStr, ".")}}; + SfVar when SfVar =:= id; SfVar =:= con; SfVar =:= tvar -> + {true, {SfVar, Pos, SfTokenStr}}; + % literals + % from so_scan: + % {CHAR, token(char, fun parse_char/1)} + % {STRING, token(string, fun parse_string/1)} + % {HEX, token(hex, fun parse_hex/1)} + % {INT, token(int, fun parse_int/1)} + % {BYTES, token(bytes, fun parse_bytes/1)} + % so_scan casts strings to binary + char -> {true, {char, Pos, so_parse_char(SfTokenStr)}}; + string -> {true, {string, Pos, so_parse_string(SfTokenStr)}}; + int16 -> {true, {hex, Pos, so_parse_hex(SfTokenStr)}}; + int10 -> {true, {int, Pos, so_parse_int(SfTokenStr)}}; + bytes -> {true, {bytes, Pos, so_parse_bytes(SfTokenStr)}}; + NYI -> + Msg = io_lib:format("sfc_so_scan:to_so_token/1: unhandled token shape: ~p", [NYI]), + error(#sfc_err{atom = nyi, + string = Msg}) + end. + +%% ak/ct/sg all tokenize to id +%% FIXEDME: implement? it seems like so_scan just parses these as +%% identifiers, so not clear what the advantage is here? +%% +%% i suppose we'll find out when we write the syntax parser +%% +%% so_scan lexes ak/ct/sg as ids and then parses them as addresses/sigs +%% in the parsing step +%API when API =:= ak; +% API =:= ct; +% API =:= sg -> +% {true, {id, Pos, SfTokenStr}}; + +% copied from so_scan.erl +so_parse_char([$' | Chars]) -> + case unicode:characters_to_nfc_list(unescape($', Chars, [])) of + [Char] -> Char; + _Bad -> + error(#sfc_err{atom = bad_token, + string = "Bad character literal: '" ++ Chars}) + end. + +so_parse_string([$" | Chars]) -> + unicode:characters_to_nfc_binary(unescape(Chars)). + +% FIXME: unfuck this shit +% +% this all works in some stupid fucking way because +% so_scan operates on lists of bytes, rather than on +% character-lists. So single codepoints have to be +% converted to multi-byte sequences or some shit. We're +% always working on lists, so this can probably be +% simplified. I don't care enough at the moment to fix +% this, but this function has been the source of +% several annoying bugs +unescape(Str) -> unescape($", Str, []). + +unescape(Delim, [Delim], Acc) -> + unicode:characters_to_binary(lists:reverse(Acc)); +unescape(Delim, [$\\, $x, ${ | Chars ], Acc) -> + {Ds, [_ | Cs]} = lists:splitwith(fun($}) -> false ; (_) -> true end, Chars), + C = list_to_integer(Ds, 16), + Utf8Cs = unicode:characters_to_nfc_list([C]), + unescape(Delim, Cs, [Utf8Cs | Acc]); +unescape(Delim, [$\\, $x, D1, D2 | Chars ], Acc) -> + C = list_to_integer([D1, D2], 16), + Utf8Cs = unicode:characters_to_nfc_list([C]), + unescape(Delim, Chars, [Utf8Cs | Acc]); +unescape(Delim, [$\\, Code | Chars], Acc) -> + Ok = fun(C) -> unescape(Delim, Chars, [C | Acc]) end, + case Code of + Delim -> Ok(Delim); + $\\ -> Ok($\\); + $b -> Ok($\b); + $e -> Ok($\e); + $f -> Ok($\f); + $n -> Ok($\n); + $r -> Ok($\r); + $t -> Ok($\t); + $v -> Ok($\v); + _ -> error(#sfc_err{atom = bad_escape_char, + string = "Bad control sequence: \\" ++ [Code]}) %% TODO + end; +unescape(Delim, [C | Chars], Acc) -> + unescape(Delim, Chars, [C | Acc]). + + +so_parse_hex("0x" ++ S) -> + list_to_integer(strip_underscores(S), 16). + +so_parse_int(S) -> + list_to_integer(strip_underscores(S)). + +so_parse_bytes("#" ++ S0) -> + S = strip_underscores(S0), + N = list_to_integer(S, 16), + Digits = (length(S) + 1) div 2, + <>. + +strip_underscores(S) -> + lists:filter(fun(C) -> C /= $_ end, S). diff --git a/src/gsc_strmatch.erl b/src/gsc_strmatch.erl new file mode 100644 index 0000000..68eb42f --- /dev/null +++ b/src/gsc_strmatch.erl @@ -0,0 +1,881 @@ +% @doc +% A string matcher is roughly analogous to a regex. It describes a pattern, +% which a string may or may not match. +% +% This module is essentially a pure erlang implementation of the subset of +% regular expressions that are needed to tokenize sophia. +% +% The intent for now (May 2026) is simply to perfectly mimic the so_scan library +% +% Reference is `docs/sophia_syntax.md` as well as `src/so_scan_lib.erl` in +% original sophia lib +% +% From docs/sophia_syntax.md: +% +% - Id = [a-z_][A-Za-z0-9_']* identifiers start with a lower case letter. +% - Con = [A-Z][A-Za-z0-9_]* constructors start with an upper case letter. +% - QId = (Con\.)+Id qualified identifiers (e.g. `Map.member`) +% - QCon = (Con\.)+Con qualified constructor +% - TVar = 'Id type variable (e.g `'a`, `'b`) +% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* integer literal with optional `_` separators +% - Bytes = #[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* byte array literal with optional `_` separators +% - String` string literal enclosed in " with escape character `\` +% - Char character literal enclosed in ' with escape character `\` +% - AccountAddress base58-encoded 32 byte account pubkey with `ak_` prefix +% - ContractAddress base58-encoded 32 byte contract address with `ct_` prefix +% - Signature base58-encoded 64 byte cryptographic signature with `sg_` prefix +% +% Sophia's notion of tokens also includes keywords, parens, whitespace, etc. +% Real reference is of course the code: +% +% Number = fun(Digit) -> [Digit, "+(_", Digit, "+)*"] end, +% DIGIT = "[0-9]", +% HEXDIGIT = "[0-9a-fA-F]", +% LOWER = "[a-z_]", +% UPPER = "[A-Z]", +% CON = [UPPER, "[a-zA-Z0-9_]*"], +% INT = Number(DIGIT), +% HEX = ["0x", Number(HEXDIGIT)], +% BYTES = ["#", Number(HEXDIGIT)], +% WS = "[\\000-\\ ]+", +% ID = [LOWER, "[a-zA-Z0-9_']*"], +% TVAR = ["'", ID], +% QID = ["(", CON, "\\.)+", ID], +% QCON = ["(", CON, "\\.)+", CON], +% OP = "[=!<>+\\-*/:&|?~@^]+", +% %% Five cases for a character +% %% * 1 7-bit ascii, not \ or ' +% %% * 2-4 8-bit values (UTF8) +% %% * \ followed by a known modifier [aernrtv] +% %% * \xhh +% %% * \x{hhh...} +% CHAR = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'", +% STRING = "\"([^\"\\\\]|(\\\\.))*\"", +% +% CommentStart = {"/\\*", push(comment, skip())}, +% CommentRules = +% [ CommentStart +% , {"\\*/", pop(skip())} +% , {"[^/*]+|[/*]", skip()} ], +% +% Keywords = ["contract", "include", "let", "switch", "type", "record", "datatype", "if", "elif", "else", "function", +% "stateful", "payable", "true", "false", "mod", "public", "entrypoint", "private", "indexed", "namespace", +% "interface", "main", "using", "as", "for", "hiding", "band", "bor", "bxor", "bnot" +% ], +% KW = string:join(Keywords, "|"), +% +% There is a lot going on in that code. This is purely the part that matches +% strings specifically, . The *tokenizer* (sfc_tokenizer) knows the hierarchy +% of sophia tokens (e.g. it knows to match keywords before identifiers, so that +% `contract` gets tokenized as a keyword and not a variable name), and then +% calls into this module in order to match the string shape it's looking for. +% @end +-module(sfc_strmatch). + +%-compile([export_all, nowarn_export_all]). + + +-export_type([ + string_matcher/0 +]). + +% given a string matcher and a string, determine match or no +-export([ + match/2 +]). + +% string matchers for sophia token shapes +-export([ + smr_sf_ws/0, + smr_sf_op/0, + smr_sf_punct/0, + smr_sf_id/0, + smr_sf_con/0, + smr_sf_qid/0, + smr_sf_qcon/0, + smr_sf_tvar/0, + smr_sf_int16/0, + smr_sf_int10/0, + smr_sf_bytes/0, + smr_sf_str/0, + smr_sf_char/0, + smr_sf_ak/0, + smr_sf_ct/0, + smr_sf_sg/0 +]). + + +% regex primitives/combinators +-export([ + % plumbing + smr_char/1, + smr_char_range/2, + smr_union/1, + smr_seq/1, + smr_plus/1, + smr_star/1, + smr_dot/0, + smr_ncmatch/2, + % porcelain + smr_string/1, + smr_oneofchars/1 +]). + + + +%%======================================================================= +%% API: Types +%%======================================================================= + +-type string_matcher() + :: {smr_char, integer()} % /a/, /b/, /cd/ + | {smr_char_range, integer(), integer()} % /[a-z]/ + | {smr_union, [string_matcher()]} % /[abc]/ + | {smr_seq, [string_matcher()]} % /abc/ + | {smr_plus, string_matcher()} % /(abc)+/ + | {smr_star, string_matcher()} % /(abc)*/ + | smr_dot % /./ + % negative conditional match + % /[^a-z]/, but more general + % /[^a-z]/ <~> smr_ncmatch(smr_char_range($a, $z), smr_dot()). + | {smr_ncmatch, MustNotMatch :: string_matcher(), + Match :: string_matcher()}. + + +%========================================================= +% API: Functions +%========================================================= + +%--------------------------------------------------------- +% API: string matching logic +% +% -export([ +% match/2 +% ]). +%--------------------------------------------------------- + +-spec match(Matcher, Source) -> MaybeMatch + when Matcher :: string_matcher(), + Source :: iolist(), + MaybeMatch :: {strmatch, Matched :: string(), Rest :: string()} + | no_strmatch. +% @doc +% normalize input to an nfc list before parsing +% +% match(Matcher, Source) -> +% string_match(Matcher, unicode:characters_to_nfc_list(Source)). +% @end +match(Matcher, Source) -> + string_match(Matcher, unicode:characters_to_nfc_list(Source)). + + +%--------------------------------------------------------- +% API: string matchers for sophia tokens +% +% -export([ +% smr_sf_ws/0, +% smr_sf_op/0, +% smr_sf_punct/0, +% smr_sf_id/0, +% smr_sf_con/0, +% smr_sf_qid/0, +% smr_sf_qcon/0, +% smr_sf_tvar/0, +% smr_sf_int16/0, +% smr_sf_int10/0, +% smr_sf_bytes/0, +% smr_sf_str/0, +% smr_sf_char/0, +% smr_sf_ak/0, +% smr_sf_ct/0, +% smr_sf_sg/0 +% ]). +%--------------------------------------------------------- + +-spec smr_sf_ws() -> string_matcher(). +% @doc +% String matcher for whitespace +% +% from so_scan.erl (9.0.0) +% +% WS = "[\\000-\\ ]+", +% +% turns out all the ascii codepoints which are 32 or lower are control chars or +% whitespace: https://www.asciitable.com/ +% @end +smr_sf_ws() -> + WhitespaceChars = lists:seq(0, 32), + smr_plus(smr_oneofchars(WhitespaceChars)). + + + +-spec smr_sf_op() -> string_matcher(). +% @doc +% String matcher for a sophia operator +% +% from so_scan.erl (9.0.0) +% +% OP = "[=!<>+\\-*/:&|?~@^]+", +% @end +smr_sf_op() -> + SfOpChars = "=!<>+-*/:&|?~@^", + SfOpChar = smr_union([smr_char(C) || C <- SfOpChars]), + smr_plus(SfOpChar). + + + +-spec smr_sf_punct() -> string_matcher(). +% @doc +% String matcher for parens/braces +% +% from so_scan.erl (9.0.0) +% +% , {"\\.\\.|[,.;()\\[\\]{}]", symbol()} +% @end + +smr_sf_punct() -> + M_DotDotOp = smr_string(".."), + M_PunctChars = smr_oneofchars(",.;()[]{}"), + smr_union([M_DotDotOp, M_PunctChars]). + + + +-spec smr_sf_id() -> string_matcher(). +% @doc +% String matcher for a sophia identifier +% +% foo +% _foo +% fooBar' +% +% - Id = [a-z_][A-Za-z0-9_']* identifiers start with a lower case letter. +% @end + +smr_sf_id() -> + % upper lower digit under quote + ULDUQ = + {smr_union, [{smr_char_range, $A, $Z}, + {smr_char_range, $a, $z}, + {smr_char_range, $0, $9}, + {smr_char, $_}, + {smr_char, $'}]}, + smr_seq([smr_union([smr_char_range($a, $z), smr_char($_)]), + smr_star(ULDUQ)]). + + + +-spec smr_sf_con() -> string_matcher(). +% @doc +% String matcher for a sophia constructor name +% +% Foo +% Foo_Bar +% Foo_Bar3_' +% +% - Con = [A-Z][A-Za-z0-9_']* constructors start with an upper case letter. +% @end + +smr_sf_con() -> + ULDU = + {smr_union, [{smr_char_range, $A, $Z}, + {smr_char_range, $a, $z}, + {smr_char_range, $0, $9}, + {smr_char, $_}]}, + smr_seq([smr_char_range($A, $Z), + smr_star(ULDU)]). + + + +-spec smr_sf_qid() -> string_matcher(). + +% @doc +% String matcher for a Sophia qualified identifier +% +% Foo.Bar.Baz.quux +% +% - QId = (Con\.)+Id qualified identifiers (e.g. `Map.member`) +% @end +smr_sf_qid() -> + Qualifier = smr_seq([smr_sf_con(), smr_char($.)]), + Qualifiers = smr_plus(Qualifier), + Identifier = smr_sf_id(), + smr_seq([Qualifiers, Identifier]). + + + +-spec smr_sf_qcon() -> string_matcher(). + +% @doc +% +% String matcher for a sophia qualified constructor +% +% Foo.Bar.Baz +% +% - QCon = (Con\.)+Con qualified constructor +% @end +smr_sf_qcon() -> + Qualifier = smr_seq([smr_sf_con(), smr_char($.)]), + Qualifiers = smr_plus(Qualifier), + Constructor = smr_sf_con(), + smr_seq([Qualifiers, Constructor]). + + + +-spec smr_sf_tvar() -> string_matcher(). + +% @doc +% String matcher for a sophia type variable; e.g. +% +% 'a +% 'foo_bar +% +% - TVar = 'Id type variable (e.g `'a`, `'b`) +% @end +smr_sf_tvar() -> + smr_seq([smr_char($'), smr_sf_id()]). + + + +-spec smr_sf_int16() -> string_matcher(). + +% @doc +% String matcher for a sophia base16 integer 0xDEAD_BEEF +% +% so_scan parses base10/base16 in one go, but i think it's clearer if they're +% different +% +% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* integer literal with optional `_` separators +% @end +smr_sf_int16() -> + HexDigit = smr_union([smr_char_range($0, $9), + smr_char_range($A, $F), + smr_char_range($a, $f)]), + HexDigits = smr_plus(HexDigit), + UHexDigits = smr_seq([smr_char($_), HexDigits]), + smr_seq([smr_string("0x"), HexDigits, smr_star(UHexDigits)]). + + + +-spec smr_sf_int10() -> string_matcher(). + +% @doc +% string matcher for a sophia base 10 int 012_345_6_7 +% +% so_scan parses base10/base16 in one go, but i think it's clearer if they're +% different +% +% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* integer literal with optional `_` separators +% @end +smr_sf_int10() -> + DecDigit = smr_char_range($0, $9), + DecDigits = smr_plus(DecDigit), + UDecDigits = smr_seq([smr_char($_), DecDigits]), + smr_seq([DecDigits, smr_star(UDecDigits)]). + + + +-spec smr_sf_bytes() -> string_matcher(). + +% @doc +% String matcher for a sophia bytestring +% +% #DEAD_BEEF +% +% - Bytes = #[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* byte array literal with optional `_` separators +% @end +smr_sf_bytes() -> + HexDigit = smr_union([smr_char_range($0, $9), + smr_char_range($A, $F), + smr_char_range($a, $f)]), + HexDigits = smr_plus(HexDigit), + UHexDigits = smr_seq([smr_char($_), HexDigits]), + smr_seq([smr_char($#), HexDigits, smr_star(UHexDigits)]). + + + +-spec smr_sf_str() -> string_matcher(). + +% @doc +% String matcher for sophia string literal +% +% String string literal enclosed in " with escape character `\` +% +% STRING = "\"([^\"\\\\]|(\\\\.))*\"", +% @end +smr_sf_str() -> + smr_seq([smr_char($"), smr_star(smr_sf_strchar()), smr_char($")]). + + +-spec smr_sf_strchar() -> string_matcher(). +% @private +% string matcher for a character in a sophia string +% +% STRING = "\"([^\"\\\\]|(\\\\.))*\"", +% +% this is for +% +% ([^\"\\\\]|(\\\\.)) +% +% cleaned up: +% +% ([^"\\]|(\\.)) +% @end +smr_sf_strchar() -> + % cannot have a literal newline in string + % + % "foo + % bar" + % + % is not a valid sophia string + AnythingButNewline = smr_ncmatch(smr_char($\n), smr_dot()), + IsASpecialChar = smr_union([smr_char($"), smr_char($\\)]), + NotEscSeq = smr_ncmatch(IsASpecialChar, AnythingButNewline), + % FIXME: maybe we should enfore escape sequence rules here? + % + % especially to be consistent with char rules + EscSeq = smr_seq([smr_char($\\), AnythingButNewline]), + smr_union([NotEscSeq, EscSeq]). + + + +-spec smr_sf_char() -> string_matcher(). +% @doc +% String matcher for a Sophia char literal +% +% From so_scan.erl: +% +% %% Five cases for a character +% %% * 1 7-bit ascii, not \ or ' +% %% * 2-4 8-bit values (UTF8) +% %% * \ followed by a known modifier [aernrtv] +% %% * \xhh +% %% * \x{hhh...} +% CHAR = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'", +% +% > Char character literal enclosed in ' with escape character `\` +% @end + +% ok we get this monstrosity +% +% "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'" +% +% there's like 4 levels of escaping and shit, so let's break it down. First +% let's notice this pattern: +% +% '(...)'. +% +% So let's make a hole +smr_sf_char() -> + smr_seq([smr_char($'), smr_sf_char_inner(), smr_char($')]). + +% smr_sf_char_inner() will deal with the stuff in the monstrosity +% +% we had this before +% "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'" +% +% let's trim +% ([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}) +% +% and reorg +% ([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f]) +% | ([\\x00-\\xff][\\x80-\\xff]{1,3}) +% | (\\\\[befnrtv'\\\\]) +% | (\\\\x[0-9a-fA-F]{2,2}) +% | (\\\\x\\{[0-9a-fA-F]*\\}) +% +% trim some more +% [\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f] +% | [\\x00-\\xff][\\x80-\\xff]{1,3} +% | \\\\[befnrtv'\\\\] +% | \\\\x[0-9a-fA-F]{2,2} +% | \\\\x\\{[0-9a-fA-F]*\\} +% +% undo some escapes +% [\x00-\x26\x28-\x5b\x5d-\x7f] +% | [\x00-\xff][\x80-\xff]{1,3} +% | \\[befnrtv'\\] +% | \\x[0-9a-fA-F]{2,2} +% | \\x\{[0-9a-fA-F]*\} +% +% rewrite +% [^'\] <~> (16#00..16#26 | 16#28..16#5b | 16#5d..16#7f) +% <<_:8, (_ >= 128){1,3}>> <~> [\x00-\xff][\x80-\xff]{1,3} +% <<$\\, X>> <~> \\[befnrtv'\\] +% \xAB <~> \\x[0-9a-fA-F]{2,2} +% \x{DEADBEEF} <~> \\x\{[0-9a-fA-F]*\} + +smr_sf_char_inner() -> + Escapable = smr_oneofchars("befnrtv'\\"), + EscSeq = smr_seq([smr_char($\\), Escapable]), + HexChar = smr_oneofchars("0123456789ABCDEFabcdef"), + HexEsc2 = smr_seq([smr_string("\\x"), HexChar, HexChar]), + HexEsc = smr_seq([smr_string("\\x{"), smr_star(HexChar), smr_char($})]), + % FIXME: possible erroneous oversimplification here + QuoteOrBackslash = smr_oneofchars([$', $\\]), + Utf8Char = smr_ncmatch(QuoteOrBackslash, smr_dot()), + smr_union([EscSeq, HexEsc2, HexEsc, Utf8Char]). + + + +-spec smr_sf_ak() -> string_matcher(). +% @doc +% string matcher for +% +% ak_.... +% +% sophia's tokenizer tokenizes ak_.../sg_... etc as identifiers and then in the +% parsing stage disambiguates them +% +% i don't like that, but for version 0.1 we're going to match the behavior of +% `so_scan` exactly, just for clarity +% +% however, note that is the token step, we can still write a string matcher to +% be useful later +% +% > AccountAddress base58-encoded 32 byte account pubkey with `ak_` prefix +% @end +smr_sf_ak() -> + smr_apistr58("ak"). + + + +-spec smr_sf_ct() -> string_matcher(). +% @doc +% string matcher for +% +% ct_.... +% +% sophia's tokenizer tokenizes ak_.../sg_... etc as identifiers and then in the +% parsing stage disambiguates them +% +% i don't like that, but for version 0.1 we're going to match the behavior of +% `so_scan` exactly, just for clarity +% +% however, note that is the token step, we can still write a string matcher to +% be useful later +% +% > ContractAddress base58-encoded 32 byte contract address with `ct_` prefix +% @end +smr_sf_ct() -> + smr_apistr58("ct"). + + + +-spec smr_sf_sg() -> string_matcher(). +% @doc +% string matcher for +% +% sg_.... +% +% sophia's tokenizer tokenizes ak_.../sg_... etc as identifiers and then in the +% parsing stage disambiguates them +% +% i don't like that, but for version 0.1 we're going to match the behavior of +% `so_scan` exactly, just for clarity +% +% however, note that is the token step, we can still write a string matcher to +% be useful later +% +% > Signature base58-encoded 64 byte cryptographic signature with `sg_` prefix +% @end +smr_sf_sg() -> + smr_apistr58("sg"). + + +-spec smr_apistr58(Prefix) -> string_matcher() + when Prefix :: string(). +% @private +% string matcher for +% +% ak_... +% ct_... +% sg_... +% +% prefix is given as arg +% +% ... are base58 chars +% @end +smr_apistr58(Prefix) -> + smr_seq([smr_string(Prefix), smr_char($_), smr_plus(smr_base58char())]). + + +smr_base58char() -> + smr_oneofchars("123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"). + + + + + + +%--------------------------------------------------------- +% API: string matcher primitive constructors +%--------------------------------------------------------- + +-spec smr_char(Char) -> string_matcher() + when Char :: integer(). +% @doc +% string matcher for a specific char +% +% /[abc]/ <~> smr_union([smr_char($a), smr_char($b), smr_char($c)]) +% +% @end +smr_char(X) when is_integer(X) -> + {smr_char, X}. + + + +-spec smr_char_range(LowerBound, UpperBound) -> string_matcher() + when LowerBound :: integer(), + UpperBound :: integer(). +% @doc +% string matcher for a range of characters +% +% /[a-z]/ <~> smr_char_range($a, $z) +% /[0-9]/ <~> smr_char_range($0, $9) +% @end +smr_char_range(X, Y) when is_integer(X), is_integer(Y) -> + {smr_char_range, X, Y}. + + + +-spec smr_union(StringMatchers) -> string_matcher() + when StringMatchers :: [StringMatcher], + StringMatcher :: string_matcher(). +% @doc +% String matcher that matches on the first matcher given that matches +% +% /[abc]/ <~> smr_union([smr_char($a), smr_char($b), smr_char($c)]) +% /(foo|bar)/ <~> smr_union([smr_string("foo"), smr_string("bar")]) +% @end +smr_union(List) when is_list(List) -> + {smr_union, List}. + + + +-spec smr_seq(StringMatchers) -> string_matcher() + when StringMatchers :: [string_matcher()]. +% @doc +% Match a sequence of matchers +% +% /abc/ <~> smr_seq([smr_char($a), smr_char($b), smr_char($c)]) +% +% smr_string/1 just maps to a sequence of chars +% @end +smr_seq(List) when is_list(List) -> + {smr_seq, List}. + + + +-spec smr_plus(Matcher) -> string_matcher() + when Matcher :: string_matcher(). +% @doc +% "one or more of"; like the `+` operator in regexes. +% +% sm_plus(SMR, Src0) -> +% case string_match(SMR, Src0) of +% {strmatch, Str, Src1} -> sm_star(SMR, Str, Src1); +% no_strmatch -> no_strmatch +% end. +% @end +smr_plus(SMR) -> + {smr_plus, SMR}. + + + +-spec smr_star(Matcher) -> string_matcher() + when Matcher :: string_matcher(). +% @doc +% "zero or more of"; like the `*` operator in regexes. +% +% sm_star(SMR, Acc, Src0) -> +% case string_match(SMR, Src0) of +% % 0 +% no_strmatch -> +% {strmatch, unicode:characters_to_list(Acc), Src0}; +% % or more +% {strmatch, Str, Src1} -> +% sm_star(SMR, [Acc, Str], Src1) +% end. +% @end +smr_star(SMR) -> + {smr_star, SMR}. + + + +-spec smr_dot() -> string_matcher(). +% @doc +% matches every character; analogous to /./ +% +% string_match(smr_dot, SrcStr) -> +% case SrcStr of +% [C | Rest] -> {strmatch, [C], Rest}; +% [] -> no_strmatch +% end; +% @end +smr_dot() -> + smr_dot. + + + +-spec smr_ncmatch(MustNotMatch, Match) -> string_matcher() + when MustNotMatch :: string_matcher(), + Match :: string_matcher(). +% @doc +% Negative conditional match; analogous to `[^abc]` but more flexible +% +% +% /[^abc]/ <-> smr_ncmatch(smr_union([smr_char($a), smr_char($b), smr_char($c)]), +% smr_dot()). +% +% +% string_match({smr_ncmatch, MustNotMatch, Match}, SrcStr) -> +% case string_match(MustNotMatch, SrcStr) of +% no_strmatch -> string_match(Match, SrcStr); +% _ -> no_strmatch +% end. +% +% @end +smr_ncmatch(A, B) -> + {smr_ncmatch, A, B}. + + + +%--------------------------------------------------------- +% string matcher helpers +%--------------------------------------------------------- + +-spec smr_string(Chars) -> string_matcher() + when Chars :: string(). +% @doc +% matches chars given in sequence; basically like putting the string in raw in +% a regex +% +% /foo/ <~> smr_string("foo") +% <~> smr_seq([smr_char($f), smr_char($o), smr_char($o)]) +% +% rewrite over smr_seq/1 and smr_char/1 +% +% smr_string(String) when is_list(String) -> +% smr_seq([smr_char(C) || C <- String]). +% @end +smr_string(String) when is_list(String) -> + smr_seq([smr_char(C) || C <- String]). + + + +-spec smr_oneofchars(Chars) -> UnionMatcher + when Chars :: string(), + UnionMatcher :: string_matcher(). +% @doc +% String matcher for one of chars +% +% /[abc]/ <~> smr_costring("abc") +% <~> smr_union([smr_char($f), smr_char($o), smr_char($o)]) +% +% this is the dual of smr_string/1. string puts chars in sequence, this puts +% chars in parallel. +% +% "costring" nomenclature is chosen specifically to annoy craig +% +% if you fix your stupid url schema i will consider changing this name +% +% the thing is though this is actually a good name, your url schema is just... +% well you know it's compact, so you have amazon beat. no page-long urls for +% gajumarket +% +% you know what, we're keeping both names +% +% i'm confusing myself, renaming to "oneofchars" +% @end +smr_oneofchars(Chars) -> + smr_union([smr_char(C) || C <- Chars]). + + +%%======================================================================= +%% INTERNALS: string matching logic +%%======================================================================= + + +-spec string_match(Matcher, Source) -> MaybeMatch + when Matcher :: string_matcher(), + Source :: string(), + MaybeMatch :: {strmatch, Matched :: string(), Rest :: string()} + | no_strmatch. +% @private +% See if the source matches the given matcher; returns +% +% %% NOTIONAL code +% string_match(/[abc]/, "abc") -> +% {strmatch, "a", "bc"} +% string_match(/[abc]/, "def") -> +% no_strmatch +% @end +string_match({smr_char, C}, SrcStr) -> + case SrcStr of + [X | Rest] when X =:= C -> {strmatch, [C], Rest}; + _ -> no_strmatch + end; +string_match({smr_char_range, X, Y}, Src0) -> + case Src0 of + [C | Src1] when X =< C, C =< Y -> {strmatch, [C], Src1}; + _ -> no_strmatch + end; +string_match({smr_union, SMRs}, Src0) -> + sm_union(SMRs, Src0); +string_match({smr_seq, SMRs}, Src0) -> + sm_seq(SMRs, [], Src0); +string_match({smr_plus, SMR}, Src0) -> + sm_plus(SMR, Src0); +string_match({smr_star, SMR}, Src0) -> + sm_star(SMR, [], Src0); +string_match(smr_dot, SrcStr) -> + case SrcStr of + [C | Rest] -> {strmatch, [C], Rest}; + [] -> no_strmatch + end; +string_match({smr_ncmatch, MustNotMatch, Match}, SrcStr) -> + case string_match(MustNotMatch, SrcStr) of + no_strmatch -> string_match(Match, SrcStr); + _ -> no_strmatch + end. + + +% @private union must match *one* thing +sm_union([SMR | SMRs], Src0) -> + case string_match(SMR, Src0) of + no_strmatch -> sm_union(SMRs, Src0); + Match -> Match + end; +sm_union([], _) -> + no_strmatch. + + +% @private sequence must match *EACH* thing +sm_seq([SMR | SMRs], Acc, Src0) -> + case string_match(SMR, Src0) of + {strmatch, Str, Src1} -> sm_seq(SMRs, [Acc, Str], Src1); + no_strmatch -> no_strmatch + end; +sm_seq([], Acc, Src) -> + {strmatch, unicode:characters_to_list(Acc), Src}. + + +% @private plus matches at least one +sm_plus(SMR, Src0) -> + case string_match(SMR, Src0) of + {strmatch, Str, Src1} -> sm_star(SMR, Str, Src1); + no_strmatch -> no_strmatch + end. + + +% @private star matches 0 or more +sm_star(SMR, Acc, Src0) -> + case string_match(SMR, Src0) of + % 0 + no_strmatch -> + {strmatch, unicode:characters_to_list(Acc), Src0}; + % or more + {strmatch, Str, Src1} -> + sm_star(SMR, [Acc, Str], Src1) + end. + diff --git a/src/gsc_tokens.erl b/src/gsc_tokens.erl new file mode 100644 index 0000000..af8140c --- /dev/null +++ b/src/gsc_tokens.erl @@ -0,0 +1,790 @@ +% @doc +% Ref: so_scan.erl +% +% This file contains a sophia tokenizer written in straightforward erlang with data +% types that are sane. +% +% For MVP it mimics the behavior of so_scan exactly, in terms of like what its +% definition of a token is and so on. +% +% sfc_so_scan.erl contains a compatibility layer that should agree with so_scan +% exactly. It converts the data types here to the shapes that so_scan outputs. +% +% This is for two reasons: +% +% 1. in order to enable testing the two modules against each other, and +% 2. to future-proof in case we decide to incrementally incorporate the sfc +% code into the legacy sophia compiler +% @end +-module(sfc_tokens). + +% meta +-export([ + token_types_parse_order/0, + kwds/0 +]). + +-export([ + take_while/2, + take_while/3, + take_block/1, + take_block_item/1, + strings/2, + slurp_plist/1 +]). + +% token slurping +-export([ + indent_level/1, + is_significant/1, + filter_significant/1, + significant_tokens/1, + tokens/1, + slurp_token/2, + slurp_token_types/3, + slurp_token_of_type/3, + new_pos/2 +]). + +-include("$sfc_include/sfc.hrl"). + + +%======================================================= +% API: functions +%======================================================= + +-spec strings(N, Tokens) -> AtMostNStrings + when N :: non_neg_integer(), + Tokens :: [sfc_token()], + AtMostNStrings :: [string()]. +% @doc return the strings of the first N tokens + +strings(N, [#sfc_token{string = S} | Rest]) when is_integer(N), N >= 1 -> + [S | strings(N-1, Rest)]; +strings(_, []) -> + []; +strings(0, _) -> + []. + + +% used by parser +% +% a block is a column-delimited list of block items +% +% BLOCK = +% foo +% ... +% bar +% ... +% baz +% ... +% +% BLOCK_ITEM = +% foo +% ... + +-spec take_block(Tokens) -> {BlockTokens, Rest} + when Tokens :: [sfc_token()], + BlockTokens :: Tokens, + Rest :: Tokens. +% @doc +% takes all tokens whose column position is >= the column position of +% the head token + +take_block([H = #sfc_token{pos = {_, BlkCol}} | T]) -> + TokenInBlock = + fun(#sfc_token{pos = {_, TkCol}}) -> + BlkCol =< TkCol + end, + take_while(TokenInBlock, [H], T); +take_block([]) -> + {[], []}. + + + +-spec take_block_item(Tokens) -> {ItemTokens, Rest} + when Tokens :: [sfc_token()], + ItemTokens :: Tokens, + Rest :: Tokens. +% @doc +% takes all tokens whose column position is > the column position of +% the head token + +take_block_item([H = #sfc_token{pos = {_, ItemCol}} | T]) -> + TokenInItem = + fun(#sfc_token{pos = {_, TkCol}}) -> + ItemCol < TkCol + end, + take_while(TokenInItem, [H], T); +take_block_item([]) -> + {[], []}. + + + +-spec slurp_plist(Tokens) -> Result + when Tokens :: [Token], + Result :: {slurp, PList :: Tokens, After :: Tokens} + | {error, Mismatch}, + Mismatch :: {fixme, mismatch, OpenStack, ClosedBy}, + OpenStack :: Tokens, + ClosedBy :: none | {value, Token}, + Token :: sfc_token(). + +% @doc +% the verbiage here is `slurp' rather than `take' because we insist on +% delimiter matching. +% +% typical happy path: +% "(foo, bar) baz" ~> {slurp, "(foo, bar)", "baz"} +% "() baz" ~> {slurp, "()", "baz"} +% "foo () baz" ~> {slurp, "", "foo () baz"} +% "(foo, bar) baz" ~> {slurp, "(foo, bar)", "baz"} +% +% typical sad path: +% "(foo, bar]" ~> {mismatch, ["("], {value, "]"}} +% "(foo, bar" ~> {mismatch, ["("], none} +% "([foo, bar)" ~> {mismatch, ["[", "("], {value, ")"}} +% +% counterintuitive: +% "[foo, bar) baz" ~> {slurp, "", "[foo, bar) baz"} +% "~!!\inv4l1d syntax" ~> {slurp, "", "~!!\inv4l1d syntax"} +% "(foo, bar)(baz)" ~> {slurp, "(foo bar)", "(baz)"} +% +% the only "syntax checking" occurring is making sure the delimiter +% stack pushes and pops properly +% +% please note that on mismatch, the list of open delimiters is +% returned in STACK order, meaning the most recent open delimiters +% first. this is more convenient for programs, but might be +% counterintuitive to end-users (who are programmers, entirely +% unfamiliar with notions like stacks and open/close delimiters) + +slurp_plist([Hd = #sfc_token{string = "("} | Tl]) -> + slurp_dlist([Hd], [Hd], Tl); +slurp_plist(Tks) -> + {slurp, [], Tks}. + + +% happy terminal case: stack popped entirely +slurp_dlist(All, [], NewTokens) -> + {slurp, lists:reverse(All), NewTokens}; +% WMA stack is nonempty +% happy cases of opens getting popped +slurp_dlist(All, [#sfc_token{string = "("} | NewOpen], + [#sfc_token{string = ")"} = Tk | NewTks]) -> + slurp_dlist([Tk | All], NewOpen, NewTks); +slurp_dlist(All, [#sfc_token{string = "["} | NewOpen], + [#sfc_token{string = "]"} = Tk | NewTks]) -> + slurp_dlist([Tk | All], NewOpen, NewTks); +slurp_dlist(All, [#sfc_token{string = "{"} | NewOpen], + [#sfc_token{string = "}"} = Tk | NewTks]) -> + slurp_dlist([Tk | All], NewOpen, NewTks); +% happy: open delimiters getting pushed +slurp_dlist(All, Opens, [#sfc_token{string = "("} = Tk | NewTks]) -> + slurp_dlist([Tk | All], [Tk | Opens], NewTks); +slurp_dlist(All, Opens, [#sfc_token{string = "["} = Tk | NewTks]) -> + slurp_dlist([Tk | All], [Tk | Opens], NewTks); +slurp_dlist(All, Opens, [#sfc_token{string = "{"} = Tk | NewTks]) -> + slurp_dlist([Tk | All], [Tk | Opens], NewTks); +% sad: mismatch cases +slurp_dlist(All, Opens, []) -> + {error, {fixme, mismatch, Opens, none}}; +slurp_dlist(All, Opens, [#sfc_token{string = "}"} = BadClose | _]) -> + {error, {fixme, mismatch, Opens, {value, BadClose}}}; +slurp_dlist(All, Opens, [#sfc_token{string = "]"} = BadClose | _]) -> + {error, {fixme, mismatch, Opens, {value, BadClose}}}; +slurp_dlist(All, Opens, [#sfc_token{string = ")"} = BadClose | _]) -> + {error, {fixme, mismatch, Opens, {value, BadClose}}}; +% general case: non-terminal token gets pushed +slurp_dlist(All, Opens, [Tk | NewTks]) -> + slurp_dlist([Tk | All], Opens, NewTks). + + +%------------------------------------------------------- +% API: meta info +% +% This is parse order definition, list of keywords, etc +% +% -export([ +% token_types_parse_order/0, +% kwds/0 +% ]). +%------------------------------------------------------- + +-spec token_types_parse_order() -> [sfc_token_type()]. +% @doc +% list of sophia tokens in parse order (if an earlier type matches, the later +% type isn't even checked) +% +% +% Rules = +% %% Comments and whitespace +% [ CommentStart +% , {"//.*", skip()} +% , {WS, skip()} +% +% %% Special characters +% , {"\\.\\.|[,.;()\\[\\]{}]", symbol()} +% +% %% Literals +% , {CHAR, token(char, fun parse_char/1)} +% , {STRING, token(string, fun parse_string/1)} +% , {HEX, token(hex, fun parse_hex/1)} +% , {INT, token(int, fun parse_int/1)} +% , {BYTES, token(bytes, fun parse_bytes/1)} +% +% %% Identifiers (qualified first!) +% , {QID, token(qid, fun(S) -> string:tokens(S, ".") end)} +% , {QCON, token(qcon, fun(S) -> string:tokens(S, ".") end)} +% , {TVAR, token(tvar)} +% , override({ID, token(id)}, {KW, symbol()}) %% Keywords override identifiers. Need to +% , {CON, token(con)} %% use override to avoid lexing "lettuce" +% %% as ['let', {id, "tuce"}]. +% %% Operators +% , {OP, symbol()} +% ], +% @end + +token_types_parse_order() -> + % written in this style to be maximally editable + lists:flatten([ + % comments and whitespace + lcom, bcom, ws, + punct, + % literals + char, string, int16, int10, bytes, + ak, ct, sg, + % qualified names need to go ahead of unqualifieds + qid, qcon, + tvar, + % keywords need to be parsed ahead of ids + kwd, id, + con, + % ops [=, =>, >>], punctuation (parens/braces) + op + ]). + + +-spec kwds() -> list(string()). +% @doc list of sophia kwds + +kwds() -> + ["contract", "include", "let", "switch", "type", "record", "datatype", + "if", "elif", "else", "function", "stateful", "payable", "true", "false", + "mod", "public", "entrypoint", "private", "indexed", "namespace", + "interface", "main", "using", "as", "for", "hiding", "band", "bor", + "bxor", "bnot"]. + + +%------------------------------------------------------- +% API: token slurping +% +% -export([ +% tokens/1, +% slurp_token/1, +% slurp_token_types/2, +% slurp_token_of_type/2 +% ]). +%------------------------------------------------------- + +% Token accessors +-spec indent_level(sfc_token()) -> pos_integer(). + +indent_level(#sfc_token{pos = {_, IndentLevel}}) -> + IndentLevel. + + + +-spec significant_tokens(SrcStr) -> Result + when SrcStr :: iolist(), + Result :: {ok, Tokens} + | {error, sfc_err()}, + Tokens :: [sfc_token()]. + +significant_tokens(SrcStr) -> + case tokens(SrcStr) of + {ok, Tokens} -> + {ok, filter_significant(Tokens)}; + Error -> + Error + end. + + + +-spec filter_significant(Tokens) -> SignificantTokens + when Tokens :: [sfc_token()], + SignificantTokens :: Tokens. + +filter_significant(Tokens) -> + lists:filter(fun is_significant/1, Tokens). + + + +-spec is_significant(Token) -> boolean() + when Token :: sfc_token(). + +is_significant(#sfc_token{type = bcom}) -> false; +is_significant(#sfc_token{type = lcom}) -> false; +is_significant(#sfc_token{type = ws}) -> false; +is_significant(_) -> true. + + +-spec tokens(SrcStr) -> Result + when SrcStr :: iolist(), + Result :: {ok, Tokens} + | {error, sfc_err()}, + Tokens :: [sfc_token()]. +% @doc +% Recursively parse all tokens off the front end of the string. `Rest' is +% the first tail of the string for which no token parser succeeds. +% +% Semantically, `Rest'` being nonempty amounts to the presence of an illegal +% character. + +tokens(S) -> + % defensive normalization + tokens([], {1, 1}, unicode:characters_to_nfc_list(S)). + +tokens(Stack, _FinalPos, "") -> + {ok, lists:reverse(Stack)}; +tokens(Stack, Pos, SrcStr) -> + case slurp_token(Pos, SrcStr) of + {tokmatch, NewToken = #sfc_token{string = TokStr}, + NewSrcStr} -> + NewPos = new_pos(Pos, TokStr), + tokens([NewToken | Stack], NewPos, NewSrcStr); + no_tokmatch -> + PrevTokens = lists:reverse(Stack), + Err = #sfc_err_no_tokmatch{prev_tokens = PrevTokens, + break_pos = Pos, + rest = SrcStr}, + {error, Err}; + % FIXME so_scan bad + % this is so fucking stupid + % so_scan for some reason allows unterminated block comments at + % the end of files + % + % for now we're just going to agree with so_scan + {ierr, unterminated_block_comment} -> + PrevTokens = lists:reverse(Stack), + Err = #sfc_err_bcom_unterminated{prev_tokens = PrevTokens, + break_pos = Pos, + rest = SrcStr}, + {error, Err}; + Error = {error, _} -> + Error + end. + +% alright some bullshit here +% +% we're computing the line/column position of each string +% +% however this is meant to be compatible with so_scan, so it's a bit wonky +% because regex list bullshit. +% +% recall that so_scan operates on the list representation of the utf-8 encoded +% bytes; this is different than on a list of bignum codepoints (e.g. +% unicode:characters_to_nfc_list(Bytes)); let's suppose some stupid complicated +% foreign character which a sane language would simply criminalize has list +% representation [ABC], but byte representation <> +% +% as far as so_scan is concerned, this means the character ABC consumes 3 +% columns. the only exception is tab characters, which always fast-forward to +% the next tab stop, which is 1-indexed because god hates all of us +% +% so the tab-stops are +% 1 9 17 25 33 ... +% +% column position is determined in all cases by byte order, EXCEPT for $\t +% which goes to the next tab stop +% +% so in general, for the token string, we need to convert to bytes first, +% then handle `\t` bytes as a special case +% +% again in the tokenizer context, we're assuming that the input to our +% tokenizer is an nfc-list which has a flat list of each unicode character in +% codepoint form +% +% here we're just converting it to byte form, then computing columns based on +% bytes +new_pos(OldPos, TokStr) -> + new_pos_bytes(unicode:characters_to_binary(TokStr), OldPos). + +% newline just goes to {L+1, 1} +new_pos_bytes(<<$\n:8, Rest/bytes>>, _Pos = {L, _}) -> + NewPos = {L+1, 1}, + new_pos_bytes(Rest, NewPos); +new_pos_bytes(<<$\t:8, Rest/bytes>>, _Pos = {Linum, Colnum1}) -> + % stinky wet sweaty robots need 1-based indexing + % so tab stops are at + % 1 9 17 25 + % super awesome dry silicon robots use 0-based indexing + % so tab stops are at + % 0 8 16 25 + Colnum0 = Colnum1 - 1, + % 0 based is based + NextTabstop0 = next_tabstop8(Colnum0), + NextTabstop1 = NextTabstop0 + 1, + NextPos = {Linum, NextTabstop1}, + new_pos_bytes(Rest, NextPos); +new_pos_bytes(<<_:8, Rest/bytes>>, _Pos = {Linum, Colnum1}) -> + % in general advance by 1 + new_pos_bytes(Rest, {Linum, Colnum1 + 1}); +new_pos_bytes(<<>>, FinalPos) -> + FinalPos. + +% 0 8 16 24 etc +% 0*8 1*8 2*8 3*8 etc +next_tabstop8(Col0) when Col0 >= 0 -> + % Col0 = PrevTabQ*8 + PrevTabR + PrevTabQ = Col0 div 8, + PrevTabR = Col0 rem 8, + Col0 = PrevTabQ*8 + PrevTabR, + NextTabQ = PrevTabQ + 1, + NextTabCol0 = NextTabQ*8, + NextTabCol0. + +%% copied from so_scan_lib.erl just to match behavior +%-define(TAB_SIZE, 8). +% +%next_pos([], P) -> P; +%next_pos([$\n | S], {L, _}) -> next_pos(S, {L + 1, 1}); +%next_pos([$\t | S], {L, C}) -> next_pos(S, {L, (C + ?TAB_SIZE - 1) div ?TAB_SIZE * ?TAB_SIZE + 1}); +%next_pos([_ | S], {L, C}) -> next_pos(S, {L, C + 1}). + + + +-spec slurp_token(Pos, SrcStr) -> Result + when Pos :: sfc_pos(), + SrcStr :: string(), + Result :: {tokmatch, Token, Rest} + | no_tokmatch + | {error, sfc_err()} + | {ierr, unterminated_block_comment}, + Token :: sfc_token(), + Rest :: string(). +% @doc +% grab a single token off the front of the string according to +% `token_types_parse_order/0' + +slurp_token(Pos, SrcStr) -> + % this is the easiest format if i need to fuck with it + slurp_token_types(token_types_parse_order(), Pos, SrcStr). + + + +-spec slurp_token_types(ParseOrder, Pos, SrcStr) -> Result + when ParseOrder :: [sfc_token_type()], + Pos :: sfc_pos(), + SrcStr :: string(), + Result :: {tokmatch, Token, Rest} + | no_tokmatch + | {error, sfc_err()} + | {ierr, unterminated_block_comment}, + Token :: sfc_token(), + Rest :: string(). +% @doc +% grab a single token off the front of the string according to +% `token_types_parse_order/0' + +slurp_token_types([TokenType | TTs], Pos, SrcStr) -> + case slurp_token_of_type(TokenType, Pos, SrcStr) of + Match = {tokmatch, _, _} -> Match; + no_tokmatch -> slurp_token_types(TTs, Pos, SrcStr); + IErr = {ierr, _} -> IErr; + Error = {error, _} -> Error + end; +slurp_token_types([], _Pos, _SrcStr) -> + no_tokmatch. + + +-spec slurp_token_of_type(TokenType, Pos, SrcStr) -> MaybeToken + when TokenType :: sfc_token_type(), + Pos :: sfc_pos(), + SrcStr :: string(), + MaybeToken :: {tokmatch, Token, Rest} + | no_tokmatch + | {error, sfc_err()} + | {ierr, unterminated_block_comment}, + Token :: sfc_token(), + Rest :: string(). +% @doc +% match a sophia token of a given type off the front of the string +% @end + +% COMMENTS AND WHITESPACE: lcom, bcom, ws +% +% sophia line comment +% +% i am not going to bother writing a string matcher thing for this +% FIXME: make a string matcher for line comments +slurp_token_of_type(lcom, Pos, SrcStr) -> + case SrcStr of + "//" ++ _ -> + {Line, Rest} = takeline("", SrcStr), + Token = #sfc_token{type = lcom, + pos = Pos, + string = Line}, + {tokmatch, Token, Rest}; + _ -> + no_tokmatch + end; +% Block comments cannot have a string matcher because they have a whole stack +% thing keeping track of depth because of nested block comments +slurp_token_of_type(bcom, Pos, SrcStr0) -> + case SrcStr0 of + "/*" ++ SrcStr1 -> + case bcom("/*", 1, SrcStr1) of + {ok, CommentStr, SrcStr2} -> + Token = #sfc_token{type = bcom, + pos = Pos, + string = CommentStr}, + {tokmatch, Token, SrcStr2}; + Error -> + Error + end; + _ -> + no_tokmatch + end; +slurp_token_of_type(ws, Pos, SrcStr) -> + WhitespaceMatcher = sfc_strmatch:smr_sf_ws(), + case sfc_strmatch:match(WhitespaceMatcher, SrcStr) of + no_strmatch -> + no_tokmatch; + {strmatch, WS, Rest} -> + Token = #sfc_token{type = ws, + pos = Pos, + string = WS}, + {tokmatch, Token, Rest} + end; +% KEYWORDS, OPERATORS, PUNCTUATION: kwd, op, punct +% +% all the kwds are valid ids, so we match as an id and then check if it's a +% kwd +% +% kwds are allowed to be prefixes for user-defined variable names; e.g. +% "lettuce" should be parsed as an id, not as ["let", "tuce"]; for this reason +% we need to be careful with greedily parsing kwds +% +% we know kwds are always ids, so we parse it as an id and see if it's one +% of the kwds +slurp_token_of_type(kwd, Pos, SrcStr) -> + case slurp_token_of_type(id, Pos, SrcStr) of + {tokmatch, IdTok = #sfc_token{string = IdStr}, Rest} -> + case lists:member(IdStr, kwds()) of + false -> + no_tokmatch; + true -> + KwTok = IdTok#sfc_token{type = kwd}, + {tokmatch, KwTok, Rest} + end; + no_tokmatch -> + no_tokmatch + end; +slurp_token_of_type(op, Pos, SrcStr) -> + case sfc_strmatch:match(sfc_strmatch:smr_sf_op(), SrcStr) of + {strmatch, Str, Rest} -> + Token = #sfc_token{type = op, pos = Pos, string = Str}, + {tokmatch, Token, Rest}; + no_strmatch -> + no_tokmatch + end; +slurp_token_of_type(punct, Pos, SrcStr) -> + case sfc_strmatch:match(sfc_strmatch:smr_sf_punct(), SrcStr) of + {strmatch, Str, Rest} -> + Token = #sfc_token{type = punct, pos = Pos, string = Str}, + {tokmatch, Token, Rest}; + no_strmatch -> + no_tokmatch + end; +% SOPHIA VARIABLE NAMES: id, con, qid, qcon, tvar +slurp_token_of_type(id, Pos, SrcStr) -> + case sfc_strmatch:match(sfc_strmatch:smr_sf_id(), SrcStr) of + {strmatch, IdStr, Rest} -> + Token = #sfc_token{type = id, pos = Pos, string = IdStr}, + {tokmatch, Token, Rest}; + no_strmatch -> + no_tokmatch + end; +slurp_token_of_type(con, Pos, SrcStr) -> + case sfc_strmatch:match(sfc_strmatch:smr_sf_con(), SrcStr) of + {strmatch, Str, Rest} -> + Token = #sfc_token{type = con, pos = Pos, string = Str}, + {tokmatch, Token, Rest}; + no_strmatch -> + no_tokmatch + end; +slurp_token_of_type(qid, Pos, SrcStr) -> + case sfc_strmatch:match(sfc_strmatch:smr_sf_qid(), SrcStr) of + {strmatch, Str, Rest} -> + Token = #sfc_token{type = qid, pos = Pos, string = Str}, + {tokmatch, Token, Rest}; + no_strmatch -> + no_tokmatch + end; +slurp_token_of_type(qcon, Pos, SrcStr) -> + case sfc_strmatch:match(sfc_strmatch:smr_sf_qcon(), SrcStr) of + {strmatch, Str, Rest} -> + Token = #sfc_token{type = qcon, pos = Pos, string = Str}, + {tokmatch, Token, Rest}; + no_strmatch -> + no_tokmatch + end; +slurp_token_of_type(tvar, Pos, SrcStr) -> + case sfc_strmatch:match(sfc_strmatch:smr_sf_tvar(), SrcStr) of + {strmatch, Str, Rest} -> + Token = #sfc_token{type = tvar, pos = Pos, string = Str}, + {tokmatch, Token, Rest}; + no_strmatch -> + no_tokmatch + end; +slurp_token_of_type(int16, Pos, SrcStr) -> + case sfc_strmatch:match(sfc_strmatch:smr_sf_int16(), SrcStr) of + {strmatch, Str, Rest} -> + Token = #sfc_token{type = int16, pos = Pos, string = Str}, + {tokmatch, Token, Rest}; + no_strmatch -> + no_tokmatch + end; +slurp_token_of_type(int10, Pos, SrcStr) -> + case sfc_strmatch:match(sfc_strmatch:smr_sf_int10(), SrcStr) of + {strmatch, Str, Rest} -> + Token = #sfc_token{type = int10, pos = Pos, string = Str}, + {tokmatch, Token, Rest}; + no_strmatch -> + no_tokmatch + end; +% LITERAL PARSERS: char, string, hex, int, bytes10, bytes16, +% ak, ct, sg +% +% char: sophia char literal +slurp_token_of_type(ak, Pos, SrcStr) -> + StringMatcher = sfc_strmatch:smr_sf_ak(), + case sfc_strmatch:match(StringMatcher, SrcStr) of + no_strmatch -> + no_tokmatch; + {strmatch, TokenStr, Rest} -> + Token = #sfc_token{type = ak, pos = Pos, string = TokenStr}, + {tokmatch, Token, Rest} + end; +slurp_token_of_type(ct, Pos, SrcStr) -> + StringMatcher = sfc_strmatch:smr_sf_ct(), + case sfc_strmatch:match(StringMatcher, SrcStr) of + no_strmatch -> + no_tokmatch; + {strmatch, TokenStr, Rest} -> + Token = #sfc_token{type = ct, pos = Pos, string = TokenStr}, + {tokmatch, Token, Rest} + end; +slurp_token_of_type(sg, Pos, SrcStr) -> + StringMatcher = sfc_strmatch:smr_sf_sg(), + case sfc_strmatch:match(StringMatcher, SrcStr) of + no_strmatch -> + no_tokmatch; + {strmatch, TokenStr, Rest} -> + Token = #sfc_token{type = sg, pos = Pos, string = TokenStr}, + {tokmatch, Token, Rest} + end; +slurp_token_of_type(char, Pos, SrcStr) -> + StringMatcher = sfc_strmatch:smr_sf_char(), + case sfc_strmatch:match(StringMatcher, SrcStr) of + no_strmatch -> + no_tokmatch; + {strmatch, TokenStr, Rest} -> + Token = #sfc_token{type = char, pos = Pos, string = TokenStr}, + {tokmatch, Token, Rest} + end; +slurp_token_of_type(string, Pos, SrcStr) -> + case sfc_strmatch:match(sfc_strmatch:smr_sf_str(), SrcStr) of + no_strmatch -> + no_tokmatch; + {strmatch, TokenStr, Rest} -> + Token = #sfc_token{type = string, pos = Pos, string = TokenStr}, + {tokmatch, Token, Rest} + end; +slurp_token_of_type(bytes, Pos, SrcStr) -> + case sfc_strmatch:match(sfc_strmatch:smr_sf_bytes(), SrcStr) of + no_strmatch -> + no_tokmatch; + {strmatch, TokenStr, Rest} -> + Token = #sfc_token{type = bytes, pos = Pos, string = TokenStr}, + {tokmatch, Token, Rest} + end; +slurp_token_of_type(NyiType, Pos, SrcStr) -> + Message = io_lib:format("cannot slurp token of type: ~p", [NyiType]), + error(#sfc_err{atom = nyi, + string = Message, + extra = [{token_type, NyiType}, + {pos, Pos}, + {rest, SrcStr}]}). + + + +takeline(Acc, "") -> {lists:reverse(Acc), ""}; +takeline(Acc, Rest = "\n" ++ _) -> {lists:reverse(Acc), Rest}; +takeline(Acc, [C | Rest]) -> takeline([C | Acc], Rest). + + +bcom(CommentStr, Depth, SrcStr0) when Depth > 0 -> + case SrcStr0 of + % premature end + "" -> + {ierr, unterminated_block_comment}; + % decrease depth + "*/" ++ SrcStr1 -> + NewCommentStr = [CommentStr, "*/"], + NewDepth = Depth - 1, + bcom(NewCommentStr, NewDepth, SrcStr1); + % increase depth + "/*" ++ SrcStr1 -> + NewCommentStr = [CommentStr, "/*"], + NewDepth = Depth + 1, + bcom(NewCommentStr, NewDepth, SrcStr1); + % same depth, add to list + [C | SrcStr1] -> + NewCommentStr = [CommentStr, C], + bcom(NewCommentStr, Depth, SrcStr1) + end; +bcom(CommentStr, 0, SrcStr) -> + {ok, unicode:characters_to_nfc_list(CommentStr), SrcStr}. + + +%------------------------------------------ +% INTERNAL UTILITIES +%------------------------------------------ + +-spec take_while(Pred, List) -> {Taken, Rest} + when Pred :: fun((Item) -> boolean()), + List :: [Item], + Taken :: List, + Rest :: List. +% @doc similar to lists:takewhile but returns {Taken, Rest}. Name is +% to remind you it returns 2 things. + +take_while(Pred, List) -> + take_while(Pred, [], List). + + +-spec take_while(Pred, Prefix, List) -> {Taken, Rest} + when Pred :: fun((Item) -> boolean()), + Prefix :: List, + List :: [Item], + Taken :: List, + Rest :: List. +% @doc +% similar to takewhile_ii/2, but returns {Prefix ++ Taken, Rest} +% +% where Prefix +% +% middle argument is just the accum +take_while(Pred, Pfx, List) -> + tw3(Pred, lists:reverse(Pfx), List). + + +tw3(Pred, Stk, [X | Xs]) -> + case Pred(X) of + true -> tw3(Pred, [X | Stk], Xs); + false -> {lists:reverse(Stk), [X | Xs]} + end; +tw3(_, Stk, []) -> + {lists:reverse(Stk), []}.