moving sfc over here

2026-06-01 16:44:33 -07:00
parent 177978d184
commit f548c7d88d
15 changed files with 3534 additions and 19 deletions
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+
+# Bash harness for GSC command line tool
+#
+# gsc foo bar baz
+# -> zx rundir \
+#       /home/pharpend/src/ioecs/gsc/cli \
+#       --libs=gsc:/home/pharpend/src/ioecs/gsc \
+#       foo bar baz
+
+zx_exists() {
+    command -v zx > /dev/null
+}
+
+
+if ! zx_exists ; then
+    echo "zx not found in \$PATH"
+    echo ""
+    echo "Please install Erlang and zx:"
+    echo "https://git.qpq.swiss/QPQ-AG/public-wiki/wiki/Install-Erlang-and-zx"
+    echo ""
+    echo "PATH=${PATH}"
+    exit 1
+fi
+
+# script location and parent dir
+FP_THIS_FILE=$(readlink -f "${BASH_SOURCE[0]}")
+FP_THIS_DIR=$(dirname -- "${FP_THIS_FILE}")
+FP_PRJ_DIR=$(dirname -- "${FP_THIS_DIR}")
+FP_CLI_DIR="${FP_PRJ_DIR}/cli"
+
+# # compute libs string
+# LIB_PARTS=()
+# for depname in "${LOCAL_DEPS[@]}"; do
+#     LIB_PARTS+=("${depname}:${PRJ_DIR}/${depname}")
+# done
+# IFS=,
+# LOCAL_LIBS="${LIB_PARTS[*]}"
+# unset IFS
+
+LOCAL_LIBS="gsc:${FP_PRJ_DIR}"
+
+#echo "zx rundir $FP_CLI_DIR --libs=$LOCAL_LIBS $@"
+
+zx rundir $FP_CLI_DIR --libs=$LOCAL_LIBS $@
@@ -0,0 +1,15 @@
+.eunit
+deps
+*.o
+*.beam
+*.plt
+*.swp
+erl_crash.dump
+ebin/*.beam
+doc/*.html
+doc/*.css
+doc/edoc-info
+doc/erlang.png
+rel/example_project
+.concrete/DEV_MODE
+.rebar
@@ -0,0 +1 @@
+{"src/*",  [debug_info, {i, "include/"}, {outdir, "ebin/"}]}.
@@ -0,0 +1,7 @@
+{application,gsc_cli,
+             [{description,"GSC CLI and test suite"},
+              {registered,[]},
+              {included_applications,[]},
+              {applications,[stdlib,kernel]},
+              {vsn,"0.1.0"},
+              {modules,[gsc_cli]}]}.
@@ -0,0 +1,25 @@
+%%% @doc
+%%% GSC CLI: gsc_cli
+%%%
+%%% This module is currently named `gsc_cli', but you may want to change that.
+%%% Remember that changing the name in `-module()' below requires renaming
+%%% this file, and it is recommended to run `zx update .app` in the main
+%%% project directory to make sure the ebin/gsc_cli.app file stays in
+%%% sync with the project whenever you add, remove or rename a module.
+%%% @end
+
+-module(gsc_cli).
+-vsn("0.1.0").
+-author("Peter Harpending <peterharpending@qpq.swiss>").
+-copyright("Peter Harpending <peterharpending@qpq.swiss>").
+-license("GPL-3.0-only").
+
+-export([start/1]).
+
+
+-spec start(ArgV) -> ok
+    when ArgV :: [string()].
+
+start(ArgV) ->
+    ok = io:format("Hello, World! Args: ~tp~n", [ArgV]),
+    zx:silent_stop().
@@ -0,0 +1,18 @@
+{name,"GSC CLI"}.
+{type,cli}.
+{modules,[]}.
+{mod,"gsc_cli"}.
+{prefix,none}.
+{author,"Peter Harpending"}.
+{desc,"GSC CLI and test suite"}.
+{package_id,{"otpr","gsc_cli",{0,1,0}}}.
+{deps,[{"otpr","gsc",{0,1,0}}]}.
+{key_name,none}.
+{a_email,"peterharpending@qpq.swiss"}.
+{c_email,"peterharpending@qpq.swiss"}.
+{copyright,"Peter Harpending"}.
+{file_exts,[]}.
+{license,"GPL-3.0-only"}.
+{repo_url,[]}.
+{tags,[]}.
+{ws_url,[]}.
@@ -0,0 +1,161 @@
+% This is a header file that contains sfc's record types
+%
+% This is in order to
+%   1. share records across modules; and,
+%   2. allow external modules to just use the sfc records
+
+%-------------------------------------------------------
+% API Types: sfc internal token representation
+%
+% -export_type([
+%     sf_token_type/0
+%     sf_token/0
+% ]).
+%-------------------------------------------------------
+
+% 
+-type sfc_token_type()
+    :: bcom      % /* ... */
+     | lcom      % //
+     | ws        % whitespace
+     % literals
+     | char      % 'a'
+     | string    % "foo"
+     | int10     % 69_420
+     | int16     % 0xDEAD_BEEF
+     | bytes     % #DEAD_BEEF
+     | ak        % ak_ABC
+     | ct        % ct_ABC
+     | sg        % sg_ABC
+     % kwds/variables/etc
+     | id         % foo, foo_bar, foo_bar'baz'  _'foo'
+     | con        % Foo, Foo_Bar, FooBar
+     | qid        % Foo.Bar.baz
+     | qcon       % Foo.Bar.Baz
+     | tvar       % 'foo, 'foo_bar, '_'foo'_'bar'''
+     % kwds ops and punct are all collapsed by
+     % so_scan:scan down to eg {'contract', {420, 69}}
+     % where {420, 69} is the source location
+     % these are three different parsers
+     | kwd        % contract, interface, payable, etc
+     | op         % "=!<>+-*/:&|?~@^"
+     | punct      % ".." | oneof(",.;()[]{}")
+     % kwds and punct are kind of the same thing
+     % but i'll keep them separate now for my own sanity. ok
+     % i guess op or symbol or whatever is fine.
+     %
+     % not going to overthink. if having them separate
+     % becomes an issue it's easy enough to collapse. harder
+     % to separate afterward if collapsing is wrong.
+     .
+
+
+-type sfc_pos() :: {Line :: pos_integer(), Col :: pos_integer()}.
+
+-record(sfc_token,
+        {type   :: sfc_token_type(),
+         pos    :: sfc_pos(),
+         string :: string()}).
+
+-type sfc_token() :: #sfc_token{}.
+
+
+% tokens are in essence the "chunk boundaries" of
+% the file
+%
+% because we have semantic whitespace, we have to be
+% careful about block declarations, because we don't
+% have an explicit open/close block token. blocks can
+% be closed in one of two ways:
+%
+%   1. a new block at a previous indent level:
+%           switch(foo)
+%               // block starts here
+%               Bar => bar()
+%               Baz => baz()
+%           quux()      // ends because indent level
+%   2. it's part of some type of list:
+%
+%           [switch(foo)
+%               // block starts here
+%               Bar => bar()
+%               Baz => baz(),   // ends here
+%            switch(bizz)
+%               // block starts here
+%               Bar => bar()
+%               Baz => baz()]
+%
+% in order to avoid speculatively inserting virtual
+% close tokens, at least on first write-out, we're
+% going to disambiguate list notions right away
+
+% token groups
+% lists = (_, _, _)
+%       | [_, _, _]
+%       | {_, _, _}
+%-record(sfc_ast1_block,
+%        {indent = none :: none | pos_integer(),
+%         decls  = none :: [sfc_ast1_decl()]}).
+%
+%-type sfc_ast() :: 
+%
+%-type sfc_list_group() :: {'(', [sfc_token()], ')'}
+%                        | {'[', [sfc_token()], ']'}
+%                        | {'{', [sfc_token()], '}'}
+%                        | {proof, 
+%                        .
+
+
+
+% @doc
+% this one is very specific so it deserves its own
+% record type: unterminated block comments at the end
+% of files. these are ok in legacy sophia, so we have to
+% specifically account for this error
+-record(sfc_err_bcom_unterminated,
+        {prev_tokens :: [sfc_token()],
+         break_pos    :: sfc_pos(),
+         rest         :: string()}).
+
+-record(sfc_err_no_tokmatch,
+        {prev_tokens :: [sfc_token()],
+         break_pos    :: sfc_pos(),
+         rest         :: string()}).
+
+
+-record(sfc_err_delims,
+        {past       :: [sfc_token()],
+         open_stack :: [sfc_token()],
+         bad_close  :: sfc_token(),
+         future     :: [sfc_token()]}).
+% FIXME
+-record(sfc_err_nyi, {}).
+-record(sfc_err_empty_file, {}).
+
+%-record(src_parse_error,
+%        {atom   = none :: none | atom(),
+%         string = 
+
+%j-record(sfc_err_gulp_ct,
+%j        {gulped :: 
+
+% @doc
+% generic placeholder error for now
+-record(sfc_err,
+        {atom           :: atom(),
+         string = none  :: none | iolist(),
+         extra  = none  :: none | any()}).
+
+% @doc all errors SFC can return conveniently listed in
+% one place
+-type sfc_err() :: #sfc_err_bcom_unterminated{}
+                 | #sfc_err_no_tokmatch{}
+                 | #sfc_err_nyi{}
+                 | #sfc_err_empty_file{}
+                 | #sfc_err{}.
+
+
+%% FIXME
+-type sfc_ast() :: any().
+
+
@@ -1,23 +1,144 @@
-%%% @doc
-%%% Gajumaru Sophia Compiler: gsc
-%%%
-%%% This module is currently named `gsc', but you may want to change that.
-%%% Remember that changing the name in `-module()' below requires renaming
-%%% this file, and it is recommended to run `zx update .app` in the main
-%%% project directory to make sure the ebin/gsc.app file stays in
-%%% sync with the project whenever you add, remove or rename a module.
-%%% @end
+% @doc bikeshed proctrastination head into vim warmup thing
+% sophia compiler from scratch by PRH
+%
+% based on original sophia compiler
+%
+% parse layers:
+% 1. sfc_tokenizer: SrcStr -> (Tokens | SigTokens)
+%
+%    SigTokens = not comment/whitespace
+%
+%    layers:
+%    a. sfc_strmatch : matches string shapes
+%    b. sfc_so_scan  : converts to so_scan shapes
+%
+% 2. sfc_ast: SigTokens -> AST
+%
+% terminology:
+%
+%  - `slurp`/`barf` borrowed from emacs paredit mode:
+%
+%     slurp : (a b) c -> (a b c)
+%     barf  : (a b c) -> a (b c)
+%
+%     * `slurp` usually involves *transforming* input
+%       into a new type (e.g. slurp a token from src
+%       string); think of slurp as a verb meaning to
+%       consume and then digest
+%     * `barf` basically means blindly splitting off
+%       input
+%
+% @end

-module(gsc).
-vsn("0.1.0").
-author("Peter Harpending <peterharpending@qpq.swiss>").
-copyright("Peter Harpending <peterharpending@qpq.swiss>").
-license("GPL-3.0-only").
-
-export([hello/0]).
+% TODO:
+%   - barf for outputs, slurp for inputs
+%   - architecture needs more careful thought but only after something works
+%   - too fuzzy right now
+%   - possibly:
+%       - rename parser layers sequentially:
+%       - sfc_
+-module(sfc).


-spec hello() -> ok.
+-export_type([
+    token/0
+]).

-hello() ->
-    io:format("~p (~p) says \"Hello!\"~n", [self(), ?MODULE]).
+-export([
+    sigtokens_from_file/1,
+    sigtokens_from_string/1,
+    tokens_from_file/1,
+    tokens_from_string/1,
+    ast_from_file/1,
+    ast_from_string/1,
+    ast_from_tokens/1
+]).
+
+-include("$sfc_include/sfc.hrl").
+
+%-----------------------------------------
+% types
+%-----------------------------------------
+
+-type token() :: sfc_token().
+
+%-----------------------------------------
+% functions
+%-----------------------------------------
+
+sigtokens_from_file(X) ->
+    case tokens_from_file(X) of
+        {ok, Y} -> {ok, sfc_tokens:filter_significant(Y)};
+        Err     -> Err
+    end.
+
+sigtokens_from_string(X) ->
+    case tokens_from_string(X) of
+        {ok, Y} -> {ok, sfc_tokens:filter_significant(Y)};
+        Err     -> Err
+    end.
+
+
+-spec tokens_from_file(FilePath) -> Perhaps
+    when FilePath :: string(),
+         Perhaps  :: {ok, Tokens}
+                   | {error, sfc_err() | any()},
+         Tokens   :: [sfc_token()].
+
+tokens_from_file(FilePath) ->
+    case file:read_file(FilePath) of
+        {ok, FBytes} -> tokens_from_string(FBytes);
+        Error        -> Error
+    end.
+
+
+
+
+-spec tokens_from_string(SrcStr) -> Result
+    when SrcStr   :: string(),
+         Result   :: {ok, Tokens}
+                   | {error, sfc_err()},
+         Tokens   :: [sfc_token()].
+
+tokens_from_string(SrcStr) ->
+    sfc_tokens:tokens(SrcStr).
+
+
+
+-spec ast_from_file(FilePath) -> Perhaps
+    when FilePath :: string(),
+         Perhaps  :: {ok, AST} | {error, sfc_err()},
+         AST      :: sfc_ast().
+
+ast_from_file(FilePath) ->
+    case file:read_file(FilePath) of
+        {ok, FileBytes} -> ast_from_string(FileBytes);
+        Error           -> Error
+    end.
+
+
+
+-spec ast_from_string(SrcStr) -> Perhaps
+    when SrcStr   :: string(),
+         Perhaps  :: {ok, AST} | {error, sfc_err()},
+         AST      :: sfc_ast().
+
+ast_from_string(SrcStr) ->
+    case sfc_tokens:significant_tokens(SrcStr) of
+        {ok, SigTks} -> ast_from_tokens(SigTks); 
+        Error        -> Error
+    end.
+
+
+
+-spec ast_from_tokens(SrcTokens) -> Perhaps
+    when SrcTokens :: [sfc_token()],
+         Perhaps   :: {ok, AST} | {error, sfc_err()},
+         AST       :: sfc_ast().
+
+ast_from_tokens(Tks) ->
+    SigTks = sfc_tokens:filter_significant(Tks),
+    case sfc_ast:gulp_file(SigTks) of
+        {gulp, AST} -> {ok, AST};
+        Error       -> Error
+    end.
@@ -0,0 +1,693 @@
+% @doc
+% type for an SFC AST
+%
+% based on so_syntax.erl
+%
+% this is the second layer in the parsing pipeline after tokens. From
+% so_syntax:
+%
+% ## Declarations
+%
+% A Sophia file consists of a sequence of *declarations* in a layout block.
+%
+% ```c
+% File ::= Block(TopDecl)
+%
+% TopDecl ::= ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl)
+%           | 'contract' 'interface' Con [Implement] '=' Block(Decl)
+%           | 'namespace' Con '=' Block(Decl)
+%           | '@compiler' PragmaOp Version
+%           | 'include' String
+%           | Using
+%
+% Implement ::= ':' Sep1(Con, ',')
+%
+% Decl ::= 'type'     Id ['(' TVar* ')'] '=' TypeAlias
+%        | 'record'   Id ['(' TVar* ')'] '=' RecordType
+%        | 'datatype' Id ['(' TVar* ')'] '=' DataType
+%        | 'let'      Id [':' Type]      '=' Expr
+%        | (EModifier* 'entrypoint' | FModifier* 'function') Block(FunDecl)
+%        | Using
+%
+% FunDecl ::= Id ':' Type                             // Type signature
+%           | Id Args [':' Type] '=' Block(Stmt)      // Definition
+%           | Id Args [':' Type] Block(GuardedDef)    // Guarded definitions
+%
+% GuardedDef ::= '|' Sep1(Expr, ',') '=' Block(Stmt)
+%
+% Using ::= 'using' Con ['as' Con] [UsingParts]
+% UsingParts ::= 'for' '[' Sep1(Id, ',') ']'
+%              | 'hiding' '[' Sep1(Id, ',') ']'
+%
+% PragmaOp ::= '<' | '=<' | '==' | '>=' | '>'
+% Version  ::= Sep1(Int, '.')
+%
+% EModifier ::= 'payable' | 'stateful'
+% FModifier ::= 'stateful' | 'private'
+%
+% Args ::= '(' Sep(Pattern, ',') ')'
+%
+% Type ::= Domain '=>' Type             // Function type
+%        | Type '(' Sep(Type, ',') ')'  // Type application
+%        | '(' Type ')'                 // Parens
+%        | 'unit' | Sep(Type, '*')      // Tuples
+%        | Id | QId | TVar
+%
+% Domain ::= Type                       // Single argument
+%          | '(' Sep(Type, ',') ')'     // Multiple arguments
+% ```
+-module(sfc_ast).
+
+%-compile([export_all,nowarn_export_all]).
+%
+%%-export_type([
+%%]).
+%%
+%%-export([
+%%    slurp/2
+%%]).
+%
+%-export([
+%    gulp_file/1
+%]).
+%
+%-include("$sfc_include/sfc.hrl").
+%
+%%%-----------------------------
+%%% TYPES: sfc_ast
+%%%-----------------------------
+%
+%
+%% % placeholders
+%-type ast_() :: any().
+%-record(ast_nyi, {tokens = none :: [sfc_token()]}).
+%-type ast_nyi() :: #ast_nyi{}.
+%
+%
+%% product type: foo * bar * baz
+%%
+%% stupid weird implication from bad syntax foresight trying to be
+%% fancy and overload what parens do is products must always have at
+%% least two operands; probably this is because `(foo)` is always the
+%% same as `foo`
+%%
+%% - 0-tuple -> `unit`
+%% - 1-tuple -> type itself
+%% - 2+      -> here
+%-record(ast_te_pi,
+%        {types = none :: none | [ast_te_()]}).
+%
+%-type ast_type_expr()
+%    :: #ast_te_fn{}   % function    (string, string) => string
+%     | #ast_te_ap{}   % application map(string, int)
+%     | #ast_te_pi{}   % product     foo * bar * baz
+%     | #ast_te_tk{}   % token       string int 'a Foo.Bar.baz
+%     | ast_nyi().
+%-type ast_te() :: ast_type_expr().
+%
+%
+%% ta = type alias
+%% 'type' Id ['(' TVar* ')'] '=' TypeAlias
+%-record(ast_ta,
+%        {alias     = none :: none | sfc_token(),
+%         tvars     = none :: none | [sfc_token()],
+%         points_to = none :: none | ast_type_expr()}).
+%
+%% Decl ::= 'type'     Id ['(' TVar* ')'] '=' TypeAlias
+%%        | 'record'   Id ['(' TVar* ')'] '=' RecordType
+%%        | 'datatype' Id ['(' TVar* ')'] '=' DataType
+%%        | 'let'      Id [':' Type]      '=' Expr
+%%        | (EModifier* 'entrypoint' | FModifier* 'function') Block(FunDecl)
+%%        | Using
+%-type decl() :: #ast_ta{}
+%              | #ast_nyi{}.
+%
+%% ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl)
+%-record(ast_ct,
+%        {payable = none :: none | boolean(),
+%         main    = none :: none | boolean(),
+%         name    = none :: none | sfc_token(),
+%         impls   = none :: none | [sfc_token()],
+%         decls   = none :: none | [decl()]}).
+%
+%-record(ast_td_iface,{}).
+%-record(ast_td_ns,{}).
+%-record(ast_td_compiler,{}).
+%-record(ast_td_include,{}).
+%-record(ast_using,{}).
+%
+%% @doc
+%% TopDecl ::= ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl)
+%%           | 'contract' 'interface' Con [Implement] '=' Block(Decl)
+%%           | 'namespace' Con '=' Block(Decl)
+%%           | '@compiler' PragmaOp Version
+%%           | 'include' String
+%%           | Using
+%-type top_decl() :: #ast_ct{}
+%                  | #ast_td_iface{}
+%                  | #ast_td_ns{}
+%                  | #ast_td_compiler{}
+%                  | #ast_td_include{}
+%                  | #ast_using{}
+%                  .
+%
+%-record(ast_file,
+%        {top_decls :: [top_decl()]}).
+%-type ast_file() :: #ast_file{}.
+%
+%
+%-type parse_error_() :: any().
+%-record(parse_error,
+%        {pos   = none :: none | sfc_pos(),
+%         msg   = ""   :: string(),
+%         subs  = []   :: [parse_error_()],
+%         extra = none :: any()}).
+%-type parse_error() :: parse_error().
+%
+%
+%-type ast()
+%    :: #ast_ct{}
+%     | #ast_file{}
+%     | #ast_nyi{}
+%     | #ast_ta{}
+%     | #ast_td_compiler{}
+%     | #ast_td_iface{}
+%     | #ast_td_include{}
+%     | #ast_td_ns{}
+%     | #ast_te_ap{}   % application map(string, int)
+%     | #ast_te_fn{}   % function    (string, string) => string
+%     | #ast_te_pi{}   % product     foo * bar * baz
+%     | #ast_te_tk{}   % token       string int 'a Foo.Bar.baz
+%     | #ast_using{}.
+%
+%
+%
+%
+%
+%%%-----------------------------
+%%% FUNCTIONS
+%%%-----------------------------
+%
+%-spec gulp_file(Tokens) -> Perhaps
+%    when Tokens :: [sfc_token()],
+%         Perhaps :: {gulp, #ast_file{}}
+%                  | {error, #parse_error{}}.
+%
+%gulp_file([]) ->
+%    {error, empty_file};
+%gulp_file(Tokens) ->
+%    case sfc_tokens:take_block(Tokens) of
+%        {Tokens, []} ->
+%            gulp_block(fun gulp_top_decl/1, Tokens);
+%            %gulp_file2([], [], Tokens);
+%        {A, B} ->
+%            StartPos = sfc_tokens:start_pos(A),
+%            ErrPos = sfc_tokens:start_pos(B),
+%            Msg = efmt("gulp_file: block starting at ~p ends at ~p instead of EOF",
+%                       [StartPos, ErrPos]),
+%            {error, #parse_error{pos = ErrPos, msg = Msg}}
+%    end.
+%
+%
+%
+%%gulp_file2(AccOks, AccErrs, Tokens = [_ | _]) ->
+%%    % ItemTokens will be nonempty
+%%    {ItemTokens, NewTokens} = sfc_tokens:take_block_item(Tokens),
+%%    case gulp_top_decl(ItemTokens) of
+%%        {gulp, Ok} -> gulp_file2([Ok | AccOks], AccErrs, NewTokens);
+%%        Err        -> gulp_file2(AccOks, [Err | AccErrs], NewTokens)
+%%    end;
+%%gulp_file2(AccOks, [], []) ->
+%%    TopDecls = lists:reverse(AccOks),
+%%    {gulp, #ast_file{top_decls = TopDecls}};
+%%gulp_file2(_, AccErrs, []) ->
+%%    Errs = [#parse_error{pos = Pos} | _]
+%%         = lists:reverse(AccErrs),
+%%    {error, #parse_error{pos = Pos, subs = Errs}}.
+%
+%-spec gulp_block(GulpItem, Tokens) -> GulpedItems
+%    when GulpItem    :: fun((ItemTokens) -> GulpedItem),
+%         Tokens      :: [sfc_token()],
+%         ItemTokens  :: Tokens,
+%         GulpedItem  :: {gulp, Item} | {error, Reason},
+%         GulpedItems :: {gulp, Items} | {error, Reason},
+%         Items       :: [Item],
+%         Item        :: ast(),
+%         Reason      :: any().
+%% @doc
+%% this almost is the familiar `traverse' idiom from Haskell
+%%
+%%   ghci> :t traverse
+%%   traverse
+%%     :: (Traversable t, Applicative f) => (a -> f b) -> t a -> f (t b)
+%%   ghci> :t mapM
+%%   mapM :: (Traversable t, Monad m) => (a -> m b) -> t a -> m (t b)
+%%
+%% (or `mapM' if youre being sloppy)
+%%
+%% however in our case our function is actually of
+%% signature
+%%
+%%   :: (t a -> f b)
+%%
+%% so this is more sophisticated
+%% @end
+%
+%gulp_block(GulpItem, Tokens) ->
+%    gulp_block(GulpItem, [], [], Tokens).
+%
+%gulp_block(GulpItem, AccOks, AccErrs, Tokens = [_ | _]) ->
+%    % ItemTokens will be nonempty
+%    {ItemTokens, NewTokens} = sfc_tokens:take_block_item(Tokens),
+%    case GulpItem(ItemTokens) of
+%        {gulp, Ok} -> gulp_block(GulpItem, [Ok | AccOks], AccErrs, NewTokens);
+%        Err        -> gulp_block(GulpItem, AccOks, [Err | AccErrs], NewTokens)
+%    end;
+%gulp_block(_, AccOks, [], []) ->
+%    Items = lists:reverse(AccOks),
+%    {gulp, Items};
+%gulp_block(_, _, AccErrs, []) ->
+%    {error, #parse_error{subs = lists:reverse(AccErrs)}}.
+%
+%
+%
+%-spec gulp_top_decl(DeclTokens) -> Result
+%    when DeclTokens :: [sfc_token()],
+%         Result     :: {gulp, ast()} | {error, any()}.
+%
+%% @doc
+%% TopDecl ::= ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl)
+%%           | 'contract' 'interface' Con [Implement] '=' Block(Decl)
+%%           | 'namespace' Con '=' Block(Decl)
+%%           | '@compiler' PragmaOp Version
+%%           | 'include' String
+%%           | Using
+%% @end
+%gulp_top_decl(DeclTokens) ->
+%    case sfc_tokens:strings(3, DeclTokens) of
+%        ["payable", "contract", "interface"] ->
+%            gulp_nyi(DeclTokens);
+%        ["contract", "interface" | _] ->
+%            gulp_nyi(DeclTokens);
+%        ["payable", "main", "contract"] ->
+%            gulp_ct2(#ast_ct{payable = true, main = true},
+%                     drop(3, DeclTokens));
+%        ["main", "contract" | _] ->
+%            gulp_ct2(#ast_ct{payable = false, main = true},
+%                     drop(2, DeclTokens));
+%        ["payable", "contract" | _] ->
+%            gulp_ct2(#ast_ct{payable = true, main = false},
+%                     drop(2, DeclTokens));
+%        ["contract" | _]  ->
+%            gulp_ct2(#ast_ct{payable = false, main = false},
+%                     drop(1, DeclTokens));
+%        ["namespace" | _] ->
+%            gulp_nyi(DeclTokens);
+%        ["@", "compiler" | _] ->
+%            gulp_nyi(DeclTokens);
+%        ["include" | _] ->
+%            gulp_nyi(DeclTokens);
+%        ["using" | _ ] ->
+%            gulp_nyi(DeclTokens);
+%        _ ->
+%            % decl tokens will always be nonempty
+%            [H = #sfc_token{pos = ErrPos} | _] = DeclTokens,
+%            EMsg = efmt("gulp_top_decl: bad token: ~p; "
+%                        "expecting one of: [payable, main, contract, namespace, @compiler, include, using]",
+%                        [H]),
+%            {error, #parse_error{pos = ErrPos,
+%                                 msg = EMsg}}
+%    end.
+%
+%
+%gulp_ct2(Ast, Tokens = [#sfc_token{pos = Pos} | _]) ->
+%    % need to pass through pos for error messages on premature end of
+%    % input... for now it's a fixme thing
+%    gulp_ct3(Ast, Tokens, Pos).
+%
+%
+%% payable main contract X : Y, Z, W = <block>
+%%                       ^
+%gulp_ct3(Ast = #ast_ct{name = none}, Tokens, Pos) ->
+%    NewPos = Pos,   %% fixme
+%    case Tokens of
+%        [Name = #sfc_token{type = con} | NewTokens] ->
+%            NewAst = Ast#ast_ct{name = Name},
+%            gulp_ct3(NewAst, NewTokens, NewPos);
+%        [Name | _] ->
+%            {error, {fixme, {bad_contract_name, Name}}};
+%        [] ->
+%            {error, {fixme, no_contract_name}}
+%    end;
+%% contract X = <block>
+%% contract X : Y, Z, W = <block>
+%%            ^
+%gulp_ct3(Ast = #ast_ct{impls = none}, Tokens, Pos) ->
+%    NewPos = Pos,    %% fixme
+%    case slurp_ct_impls(Tokens) of
+%        {slurp, Impls, NewTokens} ->
+%            NewAst = Ast#ast_ct{impls = Impls},
+%            gulp_ct3(NewAst, NewTokens, NewPos);
+%        Error ->
+%            Error
+%    end;
+%% contract X           = <block>
+%% contract X : Y, Z, W = <block>
+%%                      ^
+%gulp_ct3(Ast = #ast_ct{decls = none},
+%         [#sfc_token{string = "="} | Tokens],
+%         _Pos) ->
+%    case gulp_block(fun gulp_decl/1, Tokens) of
+%        {gulp, Decls} ->
+%            Result = Ast#ast_ct{decls = Decls},
+%            {gulp, Result};
+%        Error ->
+%            Error
+%    end;
+%gulp_ct3(Ast, Tokens, Pos) ->
+%    {error, {fixme, {nyi, gulp_ct3, [Ast, Tokens, Pos]}}}.
+%
+%
+%
+%-spec slurp_ct_impls(Tokens) -> Slurped
+%    when Tokens    :: [sfc_token()],
+%         Slurped   :: {slurp, Impls, NewTokens}
+%                    | {error, Reason},
+%         Impls     :: Tokens,
+%         NewTokens :: Tokens,
+%         Reason    :: any().
+%% @doc ": X, Y, Z = ..." ~> {slurp, [X, Y, Z], "= ..."}
+%
+%% FIXME: this should be a restructured a tiny bit for better error
+%% handling
+%slurp_ct_impls([_   = #sfc_token{string = ":"},
+%                Con = #sfc_token{type = con}
+%                | NewTokens]) ->
+%    slurp_ct_impls2([Con], NewTokens);
+%slurp_ct_impls(NewTokens = [#sfc_token{string = "="} | _]) ->
+%    {slurp, [], NewTokens};
+%slurp_ct_impls([BadToken | _]) ->
+%    Msg = efmt("slurp_ct_impls: expecting (: Con | =), got ~p", [BadToken]),
+%    {error, {fixme, Msg}};
+%slurp_ct_impls([]) ->
+%    {error, {fixme, "expecting : or =, got end-of-input"}}.
+%
+%
+%slurp_ct_impls2(Stk, [_   = #sfc_token{string = ","},
+%                      Con = #sfc_token{type = con}
+%                      | NewTokens]) ->
+%    slurp_ct_impls2([Con | Stk], NewTokens);
+%slurp_ct_impls2(Stk, NewTokens = [#sfc_token{string = "="} | _]) ->
+%    {slurp, lists:reverse(Stk), NewTokens};
+%slurp_ct_impls2(Stk, BadTokens) ->
+%    {error, {fixme, nyi, slurp_ct_impls2, [Stk, BadTokens]}}.
+%
+%
+%% Decl ::= 'type'     Id ['(' TVar* ')'] '=' TypeAlias
+%%        | 'record'   Id ['(' TVar* ')'] '=' RecordType
+%%        | 'datatype' Id ['(' TVar* ')'] '=' DataType
+%%        | 'let'      Id [':' Type]      '=' Expr
+%%        | (EModifier* 'entrypoint' | FModifier* 'function') Block(FunDecl)
+%%        | Using
+%gulp_decl(Tokens) ->
+%    case sfc_tokens:strings(1, Tokens) of
+%        ["type"] -> gulp_type_alias(Tokens);
+%        _        -> gulp_nyi(Tokens)
+%    end.
+%
+%% 'type' Id ['(' TVar* ')'] '=' TypeAlias
+%gulp_type_alias(Tokens) ->
+%    gulp_ta(#ast_ta{}, Tokens).
+%
+%
+%% 'type' Id ['(' TVar* ')'] '=' TypeAlias
+%%        ^
+%gulp_ta(Ast  = #ast_ta{alias = none},
+%        _   = [#sfc_token{string = "type"}, Alias = #sfc_token{type = id}
+%               | NewTokens]) ->
+%    NewAst = Ast#ast_ta{alias = Alias},
+%    gulp_ta(NewAst, NewTokens);
+%% 'type' Id ['(' TVar* ')'] '=' TypeAlias
+%%           ^
+%gulp_ta(Ast = #ast_ta{tvars = none},
+%        Tokens) ->
+%    case slurp_ta_tvars(Tokens) of
+%        {slurp, TVars, NewTokens} ->
+%            NewAst = Ast#ast_ta{tvars = TVars},
+%            gulp_ta(NewAst, NewTokens);
+%        Error -> Error
+%    end;
+%% 'type' Id ['(' TVar* ')'] '=' TypeAlias
+%%                           ^
+%gulp_ta(Ast = #ast_ta{points_to = none},
+%        _   = [#sfc_token{string = "="} | NewTokens]) ->
+%    case gulp_type_expr(NewTokens) of
+%        {gulp, TypeExpr} ->
+%            Result = Ast#ast_ta{points_to = TypeExpr},
+%            {gulp, Result};
+%        Error -> Error
+%    end;
+%gulp_ta(A, B) ->
+%    {error, {fixme, gulp_ta, A, B, ?MODULE, ?LINE}}.
+%
+%
+%% type foo(bar, baz) = quux
+%%         ^
+%%
+%% "(bar, baz) = ..." ~> {slurp, [bar, baz], "= ..."}
+%slurp_ta_tvars(Tks = [#sfc_token{string = "="} | _]) ->
+%    {slurp, [], Tks};
+%slurp_ta_tvars([#sfc_token{string = "("},
+%                #sfc_token{string = ")"}
+%               | NewTokens]) ->
+%    {slurp, [], NewTokens};
+%slurp_ta_tvars([_    = #sfc_token{string = "("},
+%                TVar = #sfc_token{type = tvar}
+%               | NewTokens]) ->
+%    slurp_tavars([TVar], NewTokens).
+%
+%
+%slurp_tavars(Stk,
+%             [_    = #sfc_token{string = ","},
+%              TVar = #sfc_token{type = tvar}
+%             | NewTks]) ->
+%    slurp_tavars([TVar | Stk], NewTks);
+%slurp_tavars(Stk,
+%             [#sfc_token{string = ")"}
+%             | NewTks]) ->
+%    {slurp, lists:reverse(Stk), NewTks}.
+%
+%
+%gulp_type_expr(Tokens) ->
+%    case slurp_type_expr(Tokens) of
+%        {slurp, T, []} -> {gulp, T};
+%        {slurp, _, E}  -> {error, {fixme, trailing, E}};
+%        Error          -> Error
+%    end.
+%
+%
+%% Type ::= Domain '=>' Type             // Function type
+%%        | Type '(' Sep(Type, ',') ')'  // Type application
+%%        | '(' Type ')'                 // Parens
+%%        | 'unit'
+%%        | Sep(Type, '*')      // Tuples
+%%        | Id
+%%        | QId
+%%        | TVar
+%%
+%% Domain ::= Type                       // Single argument
+%%          | '(' Sep(Type, ',') ')'     // Multiple arguments
+%
+%% terminals:
+%% TkType   = id | qid | tvar
+%% PList    = PList0 | PList1 | PList2
+%% ProdType = Type [* Type]+
+%% LamType  = Type [=> Type]+
+%%
+%% PList0 = ()
+%% PList1 = (Type)
+%% PList2 = (Type [, Type]+)
+%%
+%% Ah ok:
+%%
+%%   Type1 = TkType | PList(Type)
+%%
+%%   Type = Type1                % string, (string), (string, string)
+%%        | Type1 => Type        % function
+%%        | Type1 PList(Type)    % application
+%%        | Type1 * Type         % product
+%
+%
+%% each type either starts with an id/qid/tvar token or an open paren
+%%
+%% we're going to eliminate open paren cases first:
+%%
+%% legal cases:
+%%
+%% 1. Function types can have arbitrary #args:
+%%
+%%       ()     => ...  // legal
+%%       (_)    => ...  // legal
+%%       (_, _) => ...  // legal
+%%
+%% 2. Otherwise it must be a plist of ONLY one type
+%%
+%%       ()      // illegal, should be "unit"
+%%       (X)     // legal, equiv to X
+%%       (X, Y)  // illegal, should be X * Y
+%%
+%slurp_type_expr(Tks = [#sfc_token{pos = Pos} | _]) ->
+%    case slurp_type1(Tks) of
+%        % plist cases
+%        % function types _ => _
+%        {slurp, {plist, PList}, NewTks} ->
+%            case {PList, NewTks} of
+%                % ANY plist (including (), (foo)) followed by => is fine
+%                %   () => _
+%                %   (_) => _
+%                %   (_, _) => _
+%                {DomainType, [#sfc_token{string = "=>"} | After]} ->
+%                    case slurp_type_expr(After) of
+%                        {slurp, CodomainType, NewTks} ->
+%                            TypeExpr = #ast_te_fn{dom   = DomainType,
+%                                                  codom = CodomainType},
+%                            {slurp, TypeExpr, NewTks};
+%                        Error = {error, _} ->
+%                            Error
+%                    end;
+%                % WMA not a function type, so only plists with 1 type are
+%                % allowed
+%                {[One], After} ->
+%                    slurp_type_expr2(One, After);
+%                % plists of 0 or 2+ not allowed
+%                {[], _} ->
+%                    Msg = "() is called unit",
+%                    {error, {fixme, ?MODULE, ?LINE, illegal_type, Pos, Msg}};
+%                {_, _} ->
+%                    Msg = "(_, _, ...) must be followed by =>; "
+%                          "the type for tuples (A, B) is A * B",
+%                    {error, {fixme, ?MODULE, ?LINE, illegal_type, Pos, Msg}}
+%            end;
+%        % first token is id|qid|tvar: foo, Foo.bar, 'foo
+%        {slurp, {token, Tk}, NewTks} ->
+%            case NewTks of
+%                % foo => bar is fine, normalized to (foo) => bar
+%                [#sfc_token{string = "=>"} | After] ->
+%                    case slurp_type_expr(After) of
+%                        {slurp, Codom, NewAfter} ->
+%                            ArgType = #ast_te_tk{token = ArgTypeTk},
+%                            Dom     = [ArgType],
+%                            Result  = #ast_te_fn{dom = Dom, codom = Codom},
+%                            {slurp, Result, NewAfter};
+%                        Error = {error, _} ->
+%                            Error
+%                    end;
+%                {_, _} ->
+%                    One = #ast_te_tk{token = Tk},
+%                    slurp_type_expr2(One, After)
+%            end;
+%        Error = {error, _} ->
+%            Error
+%    end.
+%
+%
+%% parsed the first type, now need to see if this is an application
+%% or a product or if we're done
+%slurp_type_expr2(FirstType, After) ->
+%    % fixme: just going to ignore products and applications for now
+%    {slurp, FirstType, After}.
+%    %case strings(1, After) of
+%    %    ["("] -> error(nyi);
+%    %    ["*"] -> error(nyi);
+%    %case After of
+%    %    [#sfc_token{string = "("} | _] ->
+%    %        case slurp_type1_II(After) of
+%    %            {slurp, {plist, ArgTypes}, NewAfter} ->
+%    %                {slurp, #ast_te_ap{fn = FirstType
+%
+%
+%
+%
+%% Type1 = {plist, Types}            ()   (foo)    (foo, bar)
+%%       | {token, #sfc_token{}}     foo  Bar.baz  'quux
+%slurp_type1(Tks) ->
+%    case sfc_tokens:slurp_plist(Tks) of
+%        % head token is NOT open paren -> must be id/qid/tvar
+%        {slurp, [], [Tk | NewTks]} ->
+%            TkType = Tk#sfc_token.type,
+%            case TkType of
+%                id   -> {slurp, {token, Tk}, NewTks};
+%                qid  -> {slurp, {token, Tk}, NewTks};
+%                tvar -> {slurp, {token, Tk}, NewTks};
+%                _    -> {error, {fixme, illegal_type, Tk, ?MODULE, ?LINE}}
+%            end;
+%        % head token is an open paren, need to recursively consume
+%        % the inner types
+%        {slurp, PListTokens, After} ->
+%            case gulp_ptype1(PListTokens) of
+%                {gulp, Types}      -> {slurp, {plist, Types}, After};
+%                Error = {error, _} -> Error
+%            end;
+%        Error = {error, _} -> Error
+%    end.
+%
+%
+%%slurp_type_expr_plist(Tks) ->
+%%    case sfc_tokens:slurp_plist(Tks) of
+%%        % head token is NOT open paren -> must be id/qid/tvar
+%%        {slurp, [], [Tk | NewTks]} ->
+%%            TkType = Tk#sfc_token.type,
+%%            case TkType of
+%%                id   -> {slurp, {token, Tk}, NewTks};
+%%                qid  -> {slurp, {token, Tk}, NewTks};
+%%                tvar -> {slurp, {token, Tk}, NewTks};
+%%                _    -> {error, {fixme, illegal_type, Tk}}
+%%            end;
+%%        % head token is an open paren, need to recursively consume
+%%        % the inner types
+%%        {slurp, PListTokens, After} ->
+%%            case gulp_ptype1(PListTokens) of
+%%                {gulp, Types}      -> {slurp, {plist, Types}, After};
+%%                Error = {error, _} -> Error
+%%            end;
+%%        Error = {error, _} -> Error
+%%    end.
+%
+%gulp_ptype1([#sfc_token{string = "("}, #sfc_token{string = ")"}]) ->
+%    {gulp, []};
+%gulp_ptype1([#sfc_token{string = "("} | Tail]) ->
+%    gulp_ptype1_II([], Tail).
+%
+%gulp_ptype1_II(Stk, Tks) ->
+%    case slurp_type_expr(Tks) of
+%        {slurp, NewType, [#sfc_token{string = ")"}]} ->
+%            {gulp, lists:reverse([NewType | Stk])};
+%        {slurp, NewType, [#sfc_token{string = ","} | NewTks]} ->
+%            gulp_ptype1_II([NewType | Stk], NewTks);
+%        Error = {error, _} ->
+%            Error
+%    end.
+%
+%
+%%gulp_te_tk([Tk = #sfc_token{type = TkType}])
+%%        when id   =:= TkType;
+%%             qid  =:= TkType;
+%%             tvar =:= TkType ->
+%%    {gulp, #ast_te_tk{token = Tk}};
+%%gulp_te_tk(Tks) ->
+%%    {error, {fixme, nyi, gulp_te_tks, Tks}}.
+%
+%
+%gulp_nyi(Tokens) ->
+%    {gulp, #ast_nyi{tokens = Tokens}}.
+%
+%
+%drop(N, [_ | Xs]) when is_integer(N), N >= 1 ->
+%    drop(N-1, Xs);
+%drop(_, []) ->
+%    [];
+%drop(0, X) ->
+%    X.
+%
+%efmt(Fmt, Args) ->
+%    unicode:characters_to_list(io_lib:format(Fmt, Args)).
@@ -0,0 +1,85 @@
+% @doc
+% sfc_bst = ast second attempt but prefix so tab complete
+%
+% from docs/sophia/so_syntax.md:
+%
+% File ::= Block(TopDecl)
+%
+% TopDecl ::= ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl)
+%           | 'contract' 'interface' Con [Implement] '=' Block(Decl)
+%           | 'namespace' Con '=' Block(Decl)
+%           | '@compiler' PragmaOp Version
+%           | 'include' String
+%           | Using
+%
+% Implement ::= ':' Sep1(Con, ',')
+%
+% Decl ::= 'type'     Id ['(' TVar* ')'] '=' TypeAlias
+%        | 'record'   Id ['(' TVar* ')'] '=' RecordType
+%        | 'datatype' Id ['(' TVar* ')'] '=' DataType
+%        | 'let'      Id [':' Type]      '=' Expr
+%        | (EModifier* 'entrypoint' | FModifier* 'function') Block(FunDecl)
+%        | Using
+%
+% FunDecl ::= Id ':' Type                             // Type signature
+%           | Id Args [':' Type] '=' Block(Stmt)      // Definition
+%           | Id Args [':' Type] Block(GuardedDef)    // Guarded definitions
+%
+% GuardedDef ::= '|' Sep1(Expr, ',') '=' Block(Stmt)
+%
+% Using ::= 'using' Con ['as' Con] [UsingParts]
+% UsingParts ::= 'for' '[' Sep1(Id, ',') ']'
+%              | 'hiding' '[' Sep1(Id, ',') ']'
+%
+% PragmaOp ::= '<' | '=<' | '==' | '>=' | '>'
+% Version  ::= Sep1(Int, '.')
+%
+% EModifier ::= 'payable' | 'stateful'
+% FModifier ::= 'stateful' | 'private'
+%
+% Args ::= '(' Sep(Pattern, ',') ')'
+-module(sfc_bst).
+
+-compile([export_all, nowarn_export_all]).
+-include("$sfc_include/sfc.hrl").
+
+%-record(bst_nyi, {tokens :: [sfc_token()]).
+%
+%% ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl)
+%-record(bst_ct,
+%        {payable    = none :: none | boolean(),
+%         main       = none :: none | boolean(),
+%         name       = none :: none | string(),
+%         impl       = none :: none | [string()],
+%         eq         = none :: none | '=',
+%         decls      = none :: none | [decl()]}).
+%
+%% 'contract' 'interface' Con [Implement] '=' Block(Decl)
+%-record(bst_iface,
+%        {payable    = none :: none | boolean()
+%         name       = none :: none | string(),
+%         impl       = none :: none | [string()],
+%         eq         = none :: none | '=',
+%         decls      = none :: none | [decl()]}).
+%
+%
+%
+%%   File ::= Block(TopDecl)
+%gulp(file, Tokens) ->
+%    gulp({block, top_decl}, Tokens);
+%%   TopDecl ::= ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl)
+%%             | 'contract' 'interface' Con [Implement] '=' Block(Decl)
+%%             | 'namespace' Con '=' Block(Decl)
+%%             | '@compiler' PragmaOp Version
+%%             | 'include' String
+%%             | Using
+%gulp(top_decl, [#sfc_token{string = S} | Rest]) ->
+%    case strings(3, Tokens) of
+%        ["payable", "contract", "interface"] ->
+%            gulp_ct(#bst_iface{payable = true, main = true}, drop(3, Tokens));
+%        ["payable", "contract",  _] ->
+%            gulp_ct(#bst_ct{payable = true, main = false}, drop(2, Tokens));
+%        ["contract" | _] ->
+%    gulp({oneof, [bst_iface, bst_ct, bst_nyi]}, Tokens);
+%gulp(bst_ct, Tokens) ->
+%gulp(top_decl, Tokens) ->
@@ -0,0 +1,216 @@
+-module(sfc_parse_type_expr).
+
+-export_type([
+]).
+
+-export([
+    unsafe_vtks_from_string/1,
+    gulp_vtks/1,
+    take_until_ifx_op/1
+]).
+
+-include("$sfc_include/sfc.hrl").
+
+
+%------------------------------------------------------
+% TYPES
+%------------------------------------------------------
+
+-type vtk_ifx_op() :: vtk_apply_to
+                    | {'vtk_*', sfc_token()}
+                    | {'vtk_=>', sfc_token()}.
+
+-type vtk() :: sfc_token()
+             | {vtk_plist, [sfc_token()]}
+             | vtk_ifx_op().
+
+
+-type gulped(X) :: {gulp, X}
+                 | {error, any()}.
+
+-type slurped(X) :: {slurp, X, Rest :: [sfc_token()]}
+                  | {error, any()}.
+
+
+%------------------------------------------------------
+% FUNCTIONS
+%------------------------------------------------------
+
+
+-spec unsafe_vtks_from_string(SrcStr) -> Vtks when
+        SrcStr :: string(),
+        Vtks   :: [vtk()].
+
+% @doc for testing
+unsafe_vtks_from_string(S) ->
+    {ok, SigTks} = sfc_tokens:significant_tokens(S),
+    {gulp, Vtks} = gulp_vtks(SigTks),
+    Vtks.
+
+
+% operators in descending order of exteriority
+%
+% precedence verbiage confuses me
+%
+%   1 + 2 * 3 ^ 4
+%
+%   (+ 1 (* 2 (^ 3 4)))
+%
+% precedence is thinking about the operators as like
+% having arms and legs and doing something. how much
+% power do they have to bind to their neighbors. it's
+% thinking of your operators as verbs in some state
+% machine.
+%
+% i don't like when i have to model the state machine
+% in my head when i'm reading code. everything should
+% just be there. exteriority and interiority are
+% properties of nouns, not properties of verbs.
+%
+% functional programming is all about offloading as
+% much of your reasoning into nouns as possible. verbs
+% change things. and change is bad.
+-type chunk_strategy()
+        :: ast_parens   % prefix paren (x y z)
+         | '=>'         % infixr
+         | '*'          % infix
+         | 'apply'      % postfix paren  f (...)
+         .
+
+
+-record(ast_parens,
+        {open  = none :: none | sfc_token(),
+         inner = none :: none | [sfc_token()],
+         close = none :: none | sfc_token()}).
+
+chunk_by(Strategy, Tokens) ->
+    chunk_by(Strategy, [], Tokens).
+
+
+-spec chunk0(Strategy, Oks, Errs, Tokens) -> Result when
+        Strategy :: chunk_strategy(),
+        Oks      :: [any()],
+        Errs     :: [{error, Reason :: any()}],
+        Tokens   :: [sfc_token()],
+        Result   :: {ok, 
+
+gulp_chunks_by(_, Stk, [], []) ->
+    {gulp, lists:reverse(Stk)};
+gulp_chunks_by(_, _, Errs, []) ->
+    {error, {fixme, {?MODULE, ?LINE}, Errs}};
+gulp_chunks_by(plist, Stk, Errs, Tokens) ->
+    case slurp_plist_rec(Tokens) of
+        {slurp, Plist, NewTokens} ->
+            gulp_chunks_by(plist, [PList | Stk], Errs, NewTokens);
+        barf ->
+            [Token | NewTokens] = Tokens,
+            gulp_chunks_by(plist, [Token | Stk], Errs, NewTokens);
+        Error ->
+            gulp_chunks_by(plist, Stk, [Error | Errs], Tokens);
+    end.
+
+slurp_plist_rec(Tokens = [#sfc_token{string = "(" | _]) ->
+    case sfc_tokens:slurp_plist(Tokens) of
+        {slurp, [], _} ->
+            barf;
+        {slurp, PTokens, NewTokens} ->
+            PTokensInner = pt_inner(PTokens),
+    end;
+
+%-spec gulp_ifx_tree(Tokens) -> gulped(IfxTree) when
+%        Tokens :: [sfc_token()],
+%        IfxTree :: ifx_tree().
+%
+%-spec chunk_by(ChunkStrategy, Tokens) -> Result when
+%        ChunkStrategy :: chunk_strategy(),
+%        Tokens        :: [sfc_token()],
+%        Result        :: {ChunkStrategy, 
+
+
+
+-spec gulp_vtks(Tokens) -> Result when
+        Tokens          :: [sfc_token()],
+        Result          :: gulped(VirtualTokens),
+        VirtualTokens   :: [vtk()].
+
+% @private
+% for infix precedence/associativity resolution.
+%
+% better/original name (although misnomer) was
+% insert_virtual_tokens. we're using the gulp verbiage
+% because we're guarding against mistmatched delimiters
+% @end
+gulp_vtks(Tokens) ->
+    gulp_vtks([], Tokens).
+
+gulp_vtks(Acc, Tks0) ->
+    % scan until next "(" | "*" | "=>"
+    case take_until_ifx_op(Tks0) of
+        % no infix op remaining, return arg
+        {_Pfx = Tks0,
+         _Sfx = []} ->
+            {gulp, lists:flatten([Acc, Tks0])};
+        % application
+        % "... foo(bar, baz) ..."
+        %       ~> [..., foo, {plist, "(bar, baz)"}, ...]
+        {_Pfx = Tks1_BeforeOpen,
+         _Sfx = Tks2_OpenNAfter
+              = [#sfc_token{string = "("} | _]} ->
+            case sfc_tokens:slurp_plist(Tks2_OpenNAfter) of
+                {slurp, Tks2A_OpenToClose, Tks2B_AfterClose} ->
+                    NewAcc = [Acc,
+                              Tks1_BeforeOpen,
+                              vtk_apply_to,
+                              {vtk_plist, Tks2A_OpenToClose}],
+                    gulp_vtks(NewAcc, Tks2B_AfterClose);
+                Error = {error, _} ->
+                    Error
+            end;
+        % product
+        {_Pfx = Tks0_BeforeTimes,
+         _Sfx = [ Tk1A_Times = #sfc_token{string = "*"}
+                | Tks1B_AfterTimes]} ->
+            NewAcc = [Acc,
+                      Tks0_BeforeTimes,
+                      {'vtk_*', Tk1A_Times}],
+            gulp_vtks(NewAcc, Tks1B_AfterTimes);
+        % funType
+        {_Pfx = Tks0_BeforeOp,
+         _Sfx = [ Tk1A_Op = #sfc_token{string = "=>"}
+                | Tks1B_AfterOp]} ->
+            NewAcc = [Acc,
+                      Tks0_BeforeOp,
+                      {'vtk_=>', Tk1A_Op}],
+            gulp_vtks(NewAcc, Tks1B_AfterOp)
+    end.
+
+
+
+-spec take_until_ifx_op(Tokens) -> Result when
+        Tokens    :: [sfc_token()],
+        Result    :: {Taken, NewTokens},
+        Taken     :: Tokens,
+        NewTokens :: Tokens.
+% @doc
+% consume tokens until one of ["(", "*", "=>"]
+
+take_until_ifx_op(Tks) ->
+    take_until_ifx_op([], Tks).
+
+take_until_ifx_op(Stack, []) ->
+    {lists:reverse(Stack), []};
+take_until_ifx_op(Stack, Tokens = [Token | NewTokens]) ->
+    TokStr = Token#sfc_token.string,
+    Continue =
+        case TokStr of
+            % exit cases
+            "("  -> false;
+            "*"  -> false;
+            "=>" -> false;
+            _    -> true
+        end,
+    case Continue of
+        true  -> take_until_ifx_op([Token | Stack], NewTokens);
+        false -> {lists:reverse(Stack), Tokens}
+    end.
+
@@ -0,0 +1,457 @@
+% @doc compatibility layer to test against so_scan
+%
+% converts sfc_tokens data to so_scan tokens
+%
+% Ref: so_scan.erl
+-module(sfc_so_scan).
+
+-export_type([
+    so_kwd/0,
+    so_special_char/0,
+    so_symbol/0,
+    so_token2/0,
+    so_token3/0,
+    so_token/0
+]).
+
+-export([
+    scan/1,
+    ken_barson_rises/2
+]).
+
+-include("$sfc_include/sfc.hrl").
+
+%================================
+% API: types
+%================================
+
+% FIXME: single-quote all the atoms to future-proof against the elixir retards
+% adding more keywords to erlang
+-type so_kwd() :: contract
+                | include
+                | 'let'
+                | switch
+                | type
+                | record
+                | datatype
+                | 'if'
+                | elif
+                | 'else'
+                | function
+                | stateful
+                | payable
+                | 'true'
+                | 'false'
+                | mod
+                | public
+                | entrypoint
+                | private
+                | indexed
+                | namespace
+                | interface
+                | main
+                | using
+                | as
+                | for
+                | hiding
+                | 'band'
+                | 'bor'
+                | 'bxor'
+                | 'bnot'.
+
+-type so_special_char() :: '..'
+                         | ','
+                         | '.'
+                         | ';'
+                         | '('
+                         | ')'
+                         | '['
+                         | ']'
+                         | '{'
+                         | '}'
+                         .
+
+% @doc bad type... essentially a string that is the outcome of a regex match is
+% cast to an atom, and that's the type that goes here
+-type so_symbol() :: so_kwd() | so_special_char() | atom().
+
+-type so_token2() :: {Symbol   :: so_symbol(),
+                      Location :: sfc_pos()}.
+
+% FIXME
+% this is 'id', 'con', qid
+-type so_tk3type() :: char | string | hex | int | bytes | qid | qcon | tvar | id | con.
+
+-type so_token3() :: {TokenType  :: so_tk3type(),
+                      Location   :: sfc_pos(),
+                      TokenValue :: term()}.
+
+-type so_token() :: so_token2() | so_token3().
+
+
+%================================
+% API: functions
+%================================
+
+-spec scan(SrcStr) -> {ok, SoTokens} | {error, sfc_err()}
+    when SrcStr   :: iolist(),
+         SoTokens :: [so_token()].
+% @doc
+% this is meant to agree with so_scan:scan/1 in all cases
+%
+% this converts sfc's internal representation of tokens into the format that
+% so_scan outputs
+% @end
+
+scan(SrcStr) ->
+    case sfc_tokens:tokens(SrcStr) of
+        {ok, SfLTokens} ->
+            SoTokens = to_so_tokens(SfLTokens),
+            {ok, SoTokens};
+        % fucking stupid
+        {error, #sfc_err_bcom_unterminated{prev_tokens = SfcTokens}} ->
+            {ok, to_so_tokens(SfcTokens)};
+        Error ->
+            Error
+    end.
+
+
+
+-spec to_so_tokens(SfcTokens) -> SoTokens
+    when SfcTokens    :: [sfc_token()],
+         SoTokens     :: [so_token()].
+
+% @doc
+% most sfc tokens map 1-to-1 with so_tokens. the
+% exception is ak/ct/sg literals. this is a
+% many-to-one-mapping, and therefore ak, sg, ct need to
+% be handled at the list level.
+%
+% the reason is as follows:
+%
+% so_scan lexes ak_ABCD to an id, then at the parsing
+% stage computes the pubkey that corresponds to.
+%
+% as a result, if we have ak_GHI, I is not a valid
+% base58 char, so WE (sfc) end up lexing that as
+%
+%   [{ak, "ak_GH"}, {con, "I"}]
+%
+% and so_scan lexes that as {id, "ak_GHI"}].
+%
+% however we also don't ignore whitespace, so we can
+% tell if this happens because it occurs precisely when
+% an ak/sg/ct is immediately followed by a
+% non-whitespace token. however there could be more
+% than 1 and they can be a variety of different shapes.
+% so we have to greedily consume them back into a
+% single id.
+%
+% bugs in the happy path are trans-features
+%
+% so if we see an ak/ct/sg token, we summon evil ben
+% carson to reconjoin the unconjoined twins
+to_so_tokens([ AkTok = #sfc_token{type = AkCtSg, pos = Pos}
+             | Sheeit])
+        when ak =:= AkCtSg;
+             ct =:= AkCtSg;
+             sg =:= AkCtSg ->
+    {#sfc_token{string = FinalAkStr}, NewSheeit}
+        = ken_barson_rises(AkTok, Sheeit),
+    [{id, Pos, FinalAkStr}| to_so_tokens(NewSheeit)];
+% this part is just lists:filtermap
+to_so_tokens([X | Xs]) ->
+    case to_so_token(X) of
+        false           -> to_so_tokens(Xs);
+        {true, SoToken} -> [SoToken | to_so_tokens(Xs)]
+    end;
+to_so_tokens([]) ->
+    [].
+
+
+
+-spec ken_barson_rises(InitApiToken, SfToks) -> {FinalApiToken, NewSfToks}
+    when InitApiToken  :: sfc_token(),
+         SfToks        :: [sfc_token()],
+         FinalApiToken :: InitApiToken,
+         NewSfToks     :: SfToks.
+% @doc
+%
+%                  .-""""""""""""-.
+%               .-'   .-======-.   '-.
+%             .'     /  .----.  \     '.
+%            /      |  /      \  |      \
+%           |       | |  @  @  | |       |
+%           |       | |   __   | |       |
+%           |       | |  /@@\  | |       |
+%            \      | |  \__/  | |      /
+%             '.    |  \_++++_/  |    .'
+%               '-._|   |\/\/|   |_.-'
+%                   |   |/\/\|   |
+%                   |   \____/   |
+%                ___|  BEN CARSON |___
+%             .-'   |  HAS BECOME |   '-.
+%            /      |  TOO POWERFUL|      \
+%           /       |______________|       \
+%          /      .-'''-.      .-'''-.      \
+%         |      /  .-.  \    /  .-.  \      |
+%         |     |  (   )  |  |  (   )  |     |
+%         |      \  '-'  /    \  '-'  /      |
+%          \      '-...-'  /\  '-...-'      /
+%           '._           /  \           _.'
+%              '-._____.-'    '-._____.-'
+%
+%              THE SOFT-SPOKEN DOOM DOCTOR
+%               “I prescribed… CHAOS.”
+%
+%         BUGS IN THE HAPPY PATH ARE features.
+%         BUGS IN THE HAPPY PATH ARE features.
+%
+%                  WE LIKE features.
+%
+%               features MAKE US MONEY.
+%
+%                features ARE NOT FOOD.
+%                features ARE friends.
+% @end
+
+
+
+% This function takes the unconjoined twins (e.g.
+% `ak_GHI` lexed to `ak_GH` followed by `I`) and
+% recursively reconjoins them so they can all live
+% happily together as a single so_scan token which will
+% fail in the parsing step.
+%
+% on account of the property that the concatenation of
+% all the token strings equals the original source file
+% (FIXME: should test this in test suite)
+%
+% basically this looks at the next token, and if it's a
+% type that so_scan is going to consume as part of an
+% `id` token, then we add it to the stack.
+%
+% quoth claude:
+%   so_scan lexes identifiers with
+%   /[a-z_][a-zA-Z0-9_']*/. The base58 alphabet used by
+%   `smr_apistr58` is:
+%
+%       123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz
+%
+%   Characters **valid in a so_scan id tail** but
+%   **absent from base58**:
+%
+%   | char | why excluded from base58              |
+%   |------|---------------------------------------|
+%   | `0`  | looks like `O`                        |
+%   | `I`  | looks like `1` or `l`                 |
+%   | `O`  | looks like `0`                        |
+%   | `l`  | looks like `1` or `I`                 |
+%   | `_`  | not alphanumeric (structural, not b58)|
+%   | `'`  | not alphanumeric (sophia id quirk)    |
+%
+%   When any of these appear AFTER at least one valid
+%   base58 char in a `ak_`/`ct_`/`sg_` prefixed
+%   identifier, `sfc` splits what `so_scan` sees as one
+%   `id` token into 2+ sfc tokens.
+%
+%   **No split if non-base58 char is immediately after
+%   `_`**: `smr_plus` requires >=1 base58 char to
+%   match; `ak_I`, `ak_0`, `ak__bar` all fall
+%   through to `id` and both tokenizers agree.
+ken_barson_rises(AkTokAcc = #sfc_token{string = AkStr},
+                 SrcTokens = [#sfc_token{type   = CandidateType,
+                                         string = CandidateString}
+                             | Rest]) ->
+    % candidate:
+    % dig out the token type and the string
+    Smash = lists:member(CandidateType, smash_types()),
+    Pass = lists:member(CandidateType, pass_types()),
+    % sanity check
+    true = Smash or Pass,
+    if
+        Smash ->
+            % dig out the token from LcTokApi
+            NewAkStr    = AkStr ++ CandidateString,
+            NewAkTokAcc = AkTokAcc#sfc_token{string = NewAkStr},
+            ken_barson_rises(NewAkTokAcc, Rest);
+        Pass ->
+            {AkTokAcc, SrcTokens}
+    end;
+ken_barson_rises(Done, []) ->
+    {Done, []}.
+
+smash_types() ->
+    [char,      % ak_GH'a'  -> {char, "'a'"}
+     int16,     % ak_GH0xAB -> {int16, "0xAB"}
+     int10,     % ak_GH0123 -> {int10, "0123"}
+     tvar,      % ak_GH'a   -> {tvar, "'a"}
+     kwd,       % ak_GHlet  -> {kwd, "let"}
+     id,        % ak_GH_AB  -> {id, "_AB"}
+     con].      % ak_GHI    -> {con, "I"}
+
+pass_types() ->
+    % why each of these are impossible
+    % meaning the prefix for each of these will cause
+    % so_scan to break out of consuming an id, or will
+    % never be a disjoined
+    % neighbor
+    [lcom,      % ak_AB//           breaks out of id
+     bcom,      % ak_AB/*           breaks out of id
+     ws,        % ak_AB\t           breaks out of id
+     punct,     % ak_AB{            breaks out of id
+     string,    % ak_AB"            breaks out of id
+     bytes,     % ak_AB#            breaks out of id
+     ak,ct,sg,  % ak_ABak           [akctsg] all in base58 alphabet
+     qid,       % ak_ABI.Am.A.qid   ??? maybe sophia lexes this to [{id, _}, '.']?
+     qcon,      % ak_ABI.Am.A.QCon  ??? same
+     op].       % ak_AB=<           [=!<>+-*/:&|?~@^] break out of id
+
+
+
+-spec to_so_token(SfcToken) -> MaybeSoToken
+    when SfcToken     :: sfc_token(),
+         MaybeSoToken :: {true, SoToken}
+                       | false,
+         SoToken      :: so_token().
+
+% @private
+% does NOT handle ak/ct/sg because these may consume
+% follow-on tokens
+% @end
+
+to_so_token(#sfc_token{type   = SfTokenType,
+                           pos    = Pos,
+                           string = SfTokenStr}) ->
+    case SfTokenType of
+        %-----------------
+        % Ignored
+        %-----------------
+        bcom -> false;
+        lcom -> false;
+        ws   -> false;
+        %-----------------------
+        % {_, _}
+        %
+        % {contract, {420, 69}}
+        %-----------------------
+        % kwds ops and punct are all collapsed by
+        % so_scan:scan down to eg {'contract', {420, 69}}
+        % where {420, 69} is the source location
+        % these are three different parsers
+        Sym when Sym =:= kwd;
+                 Sym =:= op;
+                 Sym =:= punct ->
+            Symbol = list_to_atom(SfTokenStr),
+            {true, {Symbol, Pos}};
+        %------------------------------------
+        % {_, _, _}
+        %
+        % {id, {420, 69}, "foo"}
+        %--------------------------------
+        QVar when QVar =:= qid; QVar =:= qcon ->
+            % qualifieds tokenize to
+            % {qid, {420, 69}, ["Foo", "Bar", "baz"]}
+            {true, {QVar, Pos, string:tokens(SfTokenStr, ".")}};
+        SfVar when SfVar =:= id; SfVar =:= con; SfVar =:= tvar ->
+            {true, {SfVar, Pos, SfTokenStr}};
+        % literals
+        % from so_scan:
+        % {CHAR,   token(char,   fun parse_char/1)}
+        % {STRING, token(string, fun parse_string/1)}
+        % {HEX,    token(hex,    fun parse_hex/1)}
+        % {INT,    token(int,    fun parse_int/1)}
+        % {BYTES,  token(bytes,  fun parse_bytes/1)}
+        % so_scan casts strings to binary
+        char    -> {true, {char,   Pos, so_parse_char(SfTokenStr)}};
+        string  -> {true, {string, Pos, so_parse_string(SfTokenStr)}};
+        int16   -> {true, {hex,    Pos, so_parse_hex(SfTokenStr)}};
+        int10   -> {true, {int,    Pos, so_parse_int(SfTokenStr)}};
+        bytes   -> {true, {bytes,  Pos, so_parse_bytes(SfTokenStr)}};
+        NYI ->
+            Msg = io_lib:format("sfc_so_scan:to_so_token/1: unhandled token shape: ~p", [NYI]),
+            error(#sfc_err{atom   = nyi,
+                           string = Msg})
+   end.
+
+%% ak/ct/sg all tokenize to id
+%% FIXEDME: implement? it seems like so_scan just parses these as
+%% identifiers, so not clear what the advantage is here?
+%%
+%% i suppose we'll find out when we write the syntax parser
+%%
+%% so_scan lexes ak/ct/sg as ids and then parses them as addresses/sigs
+%% in the parsing step
+%API when API =:= ak;
+%         API =:= ct;
+%         API =:= sg ->
+%    {true, {id, Pos, SfTokenStr}};
+
+% copied from so_scan.erl
+so_parse_char([$' | Chars]) ->
+    case unicode:characters_to_nfc_list(unescape($', Chars, [])) of
+        [Char] -> Char;
+        _Bad   ->
+            error(#sfc_err{atom   = bad_token,
+                           string = "Bad character literal: '" ++ Chars})
+    end.
+
+so_parse_string([$" | Chars]) ->
+    unicode:characters_to_nfc_binary(unescape(Chars)).
+
+% FIXME: unfuck this shit
+%
+% this all works in some stupid fucking way because
+% so_scan operates on lists of bytes, rather than on
+% character-lists. So single codepoints have to be
+% converted to multi-byte sequences or some shit. We're
+% always working on lists, so this can probably be
+% simplified. I don't care enough at the moment to fix
+% this, but this function has been the source of
+% several annoying bugs
+unescape(Str) -> unescape($", Str, []).
+
+unescape(Delim, [Delim], Acc) ->
+    unicode:characters_to_binary(lists:reverse(Acc));
+unescape(Delim, [$\\, $x, ${ | Chars ], Acc) ->
+    {Ds, [_ | Cs]} = lists:splitwith(fun($}) -> false ; (_) -> true end, Chars),
+    C = list_to_integer(Ds, 16),
+    Utf8Cs = unicode:characters_to_nfc_list([C]),
+    unescape(Delim, Cs, [Utf8Cs | Acc]);
+unescape(Delim, [$\\, $x, D1, D2 | Chars ], Acc) ->
+    C = list_to_integer([D1, D2], 16),
+    Utf8Cs = unicode:characters_to_nfc_list([C]),
+    unescape(Delim, Chars, [Utf8Cs | Acc]);
+unescape(Delim, [$\\, Code | Chars], Acc) ->
+    Ok = fun(C) -> unescape(Delim, Chars, [C | Acc]) end,
+    case Code of
+        Delim -> Ok(Delim);
+        $\\ -> Ok($\\);
+        $b  -> Ok($\b);
+        $e  -> Ok($\e);
+        $f  -> Ok($\f);
+        $n  -> Ok($\n);
+        $r  -> Ok($\r);
+        $t  -> Ok($\t);
+        $v  -> Ok($\v);
+        _   -> error(#sfc_err{atom   = bad_escape_char,
+                              string = "Bad control sequence: \\" ++ [Code]})  %% TODO
+    end;
+unescape(Delim, [C | Chars], Acc) ->
+    unescape(Delim, Chars, [C | Acc]).
+
+
+so_parse_hex("0x" ++ S) ->
+    list_to_integer(strip_underscores(S), 16).
+
+so_parse_int(S) ->
+    list_to_integer(strip_underscores(S)).
+
+so_parse_bytes("#" ++ S0) ->
+    S      = strip_underscores(S0),
+    N      = list_to_integer(S, 16),
+    Digits = (length(S) + 1) div 2,
+    <<N:Digits/unit:8>>.
+
+strip_underscores(S) ->
+    lists:filter(fun(C) -> C /= $_ end, S).
@@ -0,0 +1,881 @@
+% @doc
+% A string matcher is roughly analogous to a regex. It describes a pattern,
+% which a string may or may not match.
+%
+% This module is essentially a pure erlang implementation of the subset of
+% regular expressions that are needed to tokenize sophia.
+%
+% The intent for now (May 2026) is simply to perfectly mimic the so_scan library
+%
+% Reference is `docs/sophia_syntax.md` as well as `src/so_scan_lib.erl` in
+% original sophia lib
+%
+% From docs/sophia_syntax.md:
+%
+% - Id = [a-z_][A-Za-z0-9_']* identifiers start with a lower case letter.
+% - Con = [A-Z][A-Za-z0-9_]* constructors start with an upper case letter.
+% - QId = (Con\.)+Id qualified identifiers (e.g. `Map.member`)
+% - QCon = (Con\.)+Con qualified constructor
+% - TVar = 'Id type variable (e.g `'a`, `'b`)
+% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* integer literal with optional `_` separators
+% - Bytes = #[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* byte array literal with optional `_` separators
+% - String` string literal enclosed in " with escape character `\`
+% - Char character literal enclosed in ' with escape character `\`
+% - AccountAddress base58-encoded 32 byte account pubkey with `ak_` prefix
+% - ContractAddress base58-encoded 32 byte contract address with `ct_` prefix
+% - Signature base58-encoded 64 byte cryptographic signature with `sg_` prefix
+%
+% Sophia's notion of tokens also includes keywords, parens, whitespace, etc.
+% Real reference is of course the code:
+%
+%       Number   = fun(Digit) -> [Digit, "+(_", Digit, "+)*"] end,
+%       DIGIT    = "[0-9]",
+%       HEXDIGIT = "[0-9a-fA-F]",
+%       LOWER    = "[a-z_]",
+%       UPPER    = "[A-Z]",
+%       CON      = [UPPER, "[a-zA-Z0-9_]*"],
+%       INT      = Number(DIGIT),
+%       HEX      = ["0x", Number(HEXDIGIT)],
+%       BYTES    = ["#", Number(HEXDIGIT)],
+%       WS       = "[\\000-\\ ]+",
+%       ID       = [LOWER, "[a-zA-Z0-9_']*"],
+%       TVAR     = ["'", ID],
+%       QID      = ["(", CON, "\\.)+", ID],
+%       QCON     = ["(", CON, "\\.)+", CON],
+%       OP       = "[=!<>+\\-*/:&|?~@^]+",
+%       %% Five cases for a character
+%       %%  * 1 7-bit ascii, not \ or '
+%       %%  * 2-4 8-bit values (UTF8)
+%       %%  * \ followed by a known modifier [aernrtv]
+%       %%  * \xhh
+%       %%  * \x{hhh...}
+%       CHAR     = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
+%       STRING   = "\"([^\"\\\\]|(\\\\.))*\"",
+%
+%       CommentStart = {"/\\*", push(comment, skip())},
+%       CommentRules =
+%           [ CommentStart
+%           , {"\\*/",        pop(skip())}
+%           , {"[^/*]+|[/*]", skip()} ],
+%
+%       Keywords = ["contract", "include", "let", "switch", "type", "record", "datatype", "if", "elif", "else", "function",
+%                   "stateful", "payable", "true", "false", "mod", "public", "entrypoint", "private", "indexed", "namespace",
+%                   "interface", "main", "using", "as", "for", "hiding", "band", "bor", "bxor", "bnot"
+%                  ],
+%       KW = string:join(Keywords, "|"),
+%
+% There is a lot going on in that code. This is purely the part that matches
+% strings specifically, . The *tokenizer* (sfc_tokenizer) knows the hierarchy
+% of sophia tokens (e.g. it knows to match keywords before identifiers, so that
+% `contract` gets tokenized as a keyword and not a variable name), and then
+% calls into this module in order to match the string shape it's looking for.
+% @end
+-module(sfc_strmatch).
+
+%-compile([export_all, nowarn_export_all]).
+
+
+-export_type([
+    string_matcher/0
+]).
+
+% given a string matcher and a string, determine match or no
+-export([
+    match/2
+]).
+
+% string matchers for sophia token shapes
+-export([
+    smr_sf_ws/0,
+    smr_sf_op/0,
+    smr_sf_punct/0,
+    smr_sf_id/0,
+    smr_sf_con/0,
+    smr_sf_qid/0,
+    smr_sf_qcon/0,
+    smr_sf_tvar/0,
+    smr_sf_int16/0,
+    smr_sf_int10/0,
+    smr_sf_bytes/0,
+    smr_sf_str/0,
+    smr_sf_char/0,
+    smr_sf_ak/0,
+    smr_sf_ct/0,
+    smr_sf_sg/0
+]).
+
+
+% regex primitives/combinators
+-export([
+    % plumbing
+    smr_char/1,
+    smr_char_range/2,
+    smr_union/1,
+    smr_seq/1,
+    smr_plus/1,
+    smr_star/1,
+    smr_dot/0,
+    smr_ncmatch/2,
+    % porcelain
+    smr_string/1,
+    smr_oneofchars/1
+]).
+
+
+
+%%=======================================================================
+%% API: Types
+%%=======================================================================
+
+-type string_matcher()
+    :: {smr_char, integer()}                        % /a/, /b/, /cd/
+     | {smr_char_range, integer(), integer()}       % /[a-z]/
+     | {smr_union, [string_matcher()]}              % /[abc]/
+     | {smr_seq,   [string_matcher()]}              % /abc/
+     | {smr_plus,  string_matcher()}                % /(abc)+/
+     | {smr_star,  string_matcher()}                % /(abc)*/
+     | smr_dot                                      % /./
+       % negative conditional match
+       % /[^a-z]/, but more general
+       % /[^a-z]/ <~> smr_ncmatch(smr_char_range($a, $z), smr_dot()).
+     | {smr_ncmatch, MustNotMatch :: string_matcher(),
+                     Match        :: string_matcher()}.
+
+
+%=========================================================
+% API: Functions
+%=========================================================
+
+%---------------------------------------------------------
+% API: string matching logic
+%
+% -export([
+%     match/2
+% ]).
+%---------------------------------------------------------
+
+-spec match(Matcher, Source) -> MaybeMatch
+    when Matcher    :: string_matcher(),
+         Source     :: iolist(),
+         MaybeMatch :: {strmatch, Matched :: string(), Rest :: string()}
+                     | no_strmatch.
+% @doc
+% normalize input to an nfc list before parsing
+%
+%   match(Matcher, Source) ->
+%       string_match(Matcher, unicode:characters_to_nfc_list(Source)).
+% @end
+match(Matcher, Source) ->
+    string_match(Matcher, unicode:characters_to_nfc_list(Source)).
+
+
+%---------------------------------------------------------
+% API: string matchers for sophia tokens
+%
+% -export([
+%    smr_sf_ws/0,
+%    smr_sf_op/0,
+%    smr_sf_punct/0,
+%    smr_sf_id/0,
+%    smr_sf_con/0,
+%    smr_sf_qid/0,
+%    smr_sf_qcon/0,
+%    smr_sf_tvar/0,
+%    smr_sf_int16/0,
+%    smr_sf_int10/0,
+%    smr_sf_bytes/0,
+%    smr_sf_str/0,
+%    smr_sf_char/0,
+%    smr_sf_ak/0,
+%    smr_sf_ct/0,
+%    smr_sf_sg/0
+% ]).
+%---------------------------------------------------------
+
+-spec smr_sf_ws() -> string_matcher().
+% @doc
+% String matcher for whitespace
+%
+% from so_scan.erl (9.0.0)
+%
+%    WS       = "[\\000-\\ ]+",
+%
+% turns out all the ascii codepoints which are 32 or lower are control chars or
+% whitespace: https://www.asciitable.com/
+% @end
+smr_sf_ws() ->
+    WhitespaceChars = lists:seq(0, 32),
+    smr_plus(smr_oneofchars(WhitespaceChars)).
+
+
+
+-spec smr_sf_op() -> string_matcher().
+% @doc
+% String matcher for a sophia operator
+%
+% from so_scan.erl (9.0.0)
+%
+%    OP       = "[=!<>+\\-*/:&|?~@^]+",
+% @end
+smr_sf_op() ->
+    SfOpChars = "=!<>+-*/:&|?~@^",
+    SfOpChar = smr_union([smr_char(C) || C <- SfOpChars]),
+    smr_plus(SfOpChar).
+
+
+
+-spec smr_sf_punct() -> string_matcher().
+% @doc
+% String matcher for parens/braces
+%
+% from so_scan.erl (9.0.0)
+%
+%       , {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
+% @end
+
+smr_sf_punct() ->
+    M_DotDotOp = smr_string(".."),
+    M_PunctChars = smr_oneofchars(",.;()[]{}"),
+    smr_union([M_DotDotOp, M_PunctChars]).
+
+
+
+-spec smr_sf_id() -> string_matcher().
+% @doc
+% String matcher for a sophia identifier
+%
+%   foo
+%   _foo
+%   fooBar'
+%
+% - Id = [a-z_][A-Za-z0-9_']* identifiers start with a lower case letter.
+% @end
+
+smr_sf_id() ->
+    % upper lower digit under quote
+    ULDUQ =
+        {smr_union, [{smr_char_range, $A, $Z},
+                     {smr_char_range, $a, $z},
+                     {smr_char_range, $0, $9},
+                     {smr_char, $_},
+                     {smr_char, $'}]},
+    smr_seq([smr_union([smr_char_range($a, $z), smr_char($_)]),
+             smr_star(ULDUQ)]).
+
+
+
+-spec smr_sf_con() -> string_matcher().
+% @doc
+% String matcher for a sophia constructor name
+%
+%   Foo
+%   Foo_Bar
+%   Foo_Bar3_'
+%
+% - Con = [A-Z][A-Za-z0-9_']* constructors start with an upper case letter.
+% @end
+
+smr_sf_con() ->
+    ULDU =
+        {smr_union, [{smr_char_range, $A, $Z},
+                     {smr_char_range, $a, $z},
+                     {smr_char_range, $0, $9},
+                     {smr_char, $_}]},
+    smr_seq([smr_char_range($A, $Z),
+             smr_star(ULDU)]).
+
+
+
+-spec smr_sf_qid() -> string_matcher().
+
+% @doc
+% String matcher for a Sophia qualified identifier
+%
+%   Foo.Bar.Baz.quux
+%
+% - QId = (Con\.)+Id qualified identifiers (e.g. `Map.member`)
+% @end
+smr_sf_qid() ->
+    Qualifier = smr_seq([smr_sf_con(), smr_char($.)]),
+    Qualifiers = smr_plus(Qualifier),
+    Identifier = smr_sf_id(),
+    smr_seq([Qualifiers, Identifier]).
+
+
+
+-spec smr_sf_qcon() -> string_matcher().
+
+% @doc
+%
+% String matcher for a sophia qualified constructor
+%
+%   Foo.Bar.Baz
+%
+% - QCon = (Con\.)+Con qualified constructor
+% @end
+smr_sf_qcon() ->
+    Qualifier = smr_seq([smr_sf_con(), smr_char($.)]),
+    Qualifiers = smr_plus(Qualifier),
+    Constructor = smr_sf_con(),
+    smr_seq([Qualifiers, Constructor]).
+
+
+
+-spec smr_sf_tvar() -> string_matcher().
+
+% @doc
+% String matcher for a sophia type variable; e.g.
+%
+%   'a
+%   'foo_bar
+%
+% - TVar = 'Id type variable (e.g `'a`, `'b`)
+% @end
+smr_sf_tvar() ->
+    smr_seq([smr_char($'), smr_sf_id()]).
+
+
+
+-spec smr_sf_int16() -> string_matcher().
+
+% @doc
+% String matcher for a sophia base16 integer 0xDEAD_BEEF
+%
+% so_scan parses base10/base16 in one go, but i think it's clearer if they're
+% different
+%
+% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* integer literal with optional `_` separators
+% @end
+smr_sf_int16() ->
+    HexDigit   = smr_union([smr_char_range($0, $9),
+                            smr_char_range($A, $F),
+                            smr_char_range($a, $f)]),
+    HexDigits  = smr_plus(HexDigit),
+    UHexDigits = smr_seq([smr_char($_), HexDigits]),
+    smr_seq([smr_string("0x"), HexDigits, smr_star(UHexDigits)]).
+
+
+
+-spec smr_sf_int10() -> string_matcher().
+
+% @doc
+% string matcher for a sophia base 10 int 012_345_6_7
+%
+% so_scan parses base10/base16 in one go, but i think it's clearer if they're
+% different
+%
+% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* integer literal with optional `_` separators
+% @end
+smr_sf_int10() ->
+    DecDigit   = smr_char_range($0, $9),
+    DecDigits  = smr_plus(DecDigit),
+    UDecDigits = smr_seq([smr_char($_), DecDigits]),
+    smr_seq([DecDigits, smr_star(UDecDigits)]).
+
+
+
+-spec smr_sf_bytes() -> string_matcher().
+
+% @doc
+% String matcher for a sophia bytestring
+%
+%   #DEAD_BEEF
+%
+% - Bytes = #[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* byte array literal with optional `_` separators
+% @end
+smr_sf_bytes() ->
+    HexDigit   = smr_union([smr_char_range($0, $9),
+                            smr_char_range($A, $F),
+                            smr_char_range($a, $f)]),
+    HexDigits  = smr_plus(HexDigit),
+    UHexDigits = smr_seq([smr_char($_), HexDigits]),
+    smr_seq([smr_char($#), HexDigits, smr_star(UHexDigits)]).
+
+
+
+-spec smr_sf_str() -> string_matcher().
+
+% @doc
+% String matcher for sophia string literal
+%
+% String string literal enclosed in " with escape character `\`
+%
+%  STRING   = "\"([^\"\\\\]|(\\\\.))*\"",
+% @end
+smr_sf_str() ->
+    smr_seq([smr_char($"), smr_star(smr_sf_strchar()), smr_char($")]).
+
+
+-spec smr_sf_strchar() -> string_matcher().
+% @private
+% string matcher for a character in a sophia string
+%
+%  STRING   = "\"([^\"\\\\]|(\\\\.))*\"",
+%
+% this is for
+%
+%  ([^\"\\\\]|(\\\\.))
+%
+% cleaned up:
+%
+%   ([^"\\]|(\\.))
+% @end
+smr_sf_strchar() ->
+    % cannot have a literal newline in string
+    %
+    %   "foo
+    %   bar"
+    %
+    % is not a valid sophia string
+    AnythingButNewline = smr_ncmatch(smr_char($\n), smr_dot()),
+    IsASpecialChar = smr_union([smr_char($"), smr_char($\\)]),
+    NotEscSeq      = smr_ncmatch(IsASpecialChar, AnythingButNewline),
+    % FIXME: maybe we should enfore escape sequence rules here?
+    %
+    % especially to be consistent with char rules
+    EscSeq         = smr_seq([smr_char($\\), AnythingButNewline]),
+    smr_union([NotEscSeq, EscSeq]).
+
+
+
+-spec smr_sf_char() -> string_matcher().
+% @doc
+% String matcher for a Sophia char literal
+%
+% From so_scan.erl:
+%
+%       %% Five cases for a character
+%       %%  * 1 7-bit ascii, not \ or '
+%       %%  * 2-4 8-bit values (UTF8)
+%       %%  * \ followed by a known modifier [aernrtv]
+%       %%  * \xhh
+%       %%  * \x{hhh...}
+%       CHAR     = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
+%
+% > Char character literal enclosed in ' with escape character `\`
+% @end
+
+% ok we get this monstrosity
+%
+%   "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'"
+%
+% there's like 4 levels of escaping and shit, so let's break it down. First
+% let's notice this pattern:
+%
+%   '(...)'.
+%
+% So let's make a hole
+smr_sf_char() ->
+    smr_seq([smr_char($'), smr_sf_char_inner(), smr_char($')]).
+
+% smr_sf_char_inner() will deal with the stuff in the monstrosity
+%
+% we had this before
+%   "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'"
+%
+% let's trim
+%   ([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\})
+%
+% and reorg
+%     ([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])
+%   | ([\\x00-\\xff][\\x80-\\xff]{1,3})
+%   | (\\\\[befnrtv'\\\\])
+%   | (\\\\x[0-9a-fA-F]{2,2})
+%   | (\\\\x\\{[0-9a-fA-F]*\\})
+%
+% trim some more
+%     [\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f]
+%   | [\\x00-\\xff][\\x80-\\xff]{1,3}
+%   | \\\\[befnrtv'\\\\]
+%   | \\\\x[0-9a-fA-F]{2,2}
+%   | \\\\x\\{[0-9a-fA-F]*\\}
+%
+% undo some escapes
+%     [\x00-\x26\x28-\x5b\x5d-\x7f]
+%   | [\x00-\xff][\x80-\xff]{1,3}
+%   | \\[befnrtv'\\]
+%   | \\x[0-9a-fA-F]{2,2}
+%   | \\x\{[0-9a-fA-F]*\}
+%
+% rewrite
+%   [^'\]                    <~> (16#00..16#26 | 16#28..16#5b | 16#5d..16#7f)
+%   <<_:8, (_ >= 128){1,3}>> <~> [\x00-\xff][\x80-\xff]{1,3}
+%   <<$\\, X>>               <~> \\[befnrtv'\\]
+%   \xAB                     <~> \\x[0-9a-fA-F]{2,2}
+%   \x{DEADBEEF}             <~> \\x\{[0-9a-fA-F]*\}
+
+smr_sf_char_inner() ->
+    Escapable = smr_oneofchars("befnrtv'\\"),
+    EscSeq    = smr_seq([smr_char($\\), Escapable]),
+    HexChar  = smr_oneofchars("0123456789ABCDEFabcdef"),
+    HexEsc2  = smr_seq([smr_string("\\x"), HexChar, HexChar]),
+    HexEsc   = smr_seq([smr_string("\\x{"), smr_star(HexChar), smr_char($})]),
+    % FIXME: possible erroneous oversimplification here
+    QuoteOrBackslash = smr_oneofchars([$', $\\]),
+    Utf8Char = smr_ncmatch(QuoteOrBackslash, smr_dot()),
+    smr_union([EscSeq, HexEsc2, HexEsc, Utf8Char]).
+
+
+
+-spec smr_sf_ak() -> string_matcher().
+% @doc
+% string matcher for
+%
+%   ak_....
+%
+% sophia's tokenizer tokenizes ak_.../sg_... etc as identifiers and then in the
+% parsing stage disambiguates them
+%
+% i don't like that, but for version 0.1 we're going to match the behavior of
+% `so_scan` exactly, just for clarity
+%
+% however, note that is the token step, we can still write a string matcher to
+% be useful later
+%
+% > AccountAddress base58-encoded 32 byte account pubkey with `ak_` prefix
+% @end
+smr_sf_ak() ->
+    smr_apistr58("ak").
+
+
+
+-spec smr_sf_ct() -> string_matcher().
+% @doc
+% string matcher for
+%
+%   ct_....
+%
+% sophia's tokenizer tokenizes ak_.../sg_... etc as identifiers and then in the
+% parsing stage disambiguates them
+%
+% i don't like that, but for version 0.1 we're going to match the behavior of
+% `so_scan` exactly, just for clarity
+%
+% however, note that is the token step, we can still write a string matcher to
+% be useful later
+%
+% > ContractAddress base58-encoded 32 byte contract address with `ct_` prefix
+% @end
+smr_sf_ct() ->
+    smr_apistr58("ct").
+
+
+
+-spec smr_sf_sg() -> string_matcher().
+% @doc
+% string matcher for
+%
+%   sg_....
+%
+% sophia's tokenizer tokenizes ak_.../sg_... etc as identifiers and then in the
+% parsing stage disambiguates them
+%
+% i don't like that, but for version 0.1 we're going to match the behavior of
+% `so_scan` exactly, just for clarity
+%
+% however, note that is the token step, we can still write a string matcher to
+% be useful later
+%
+% > Signature base58-encoded 64 byte cryptographic signature with `sg_` prefix
+% @end
+smr_sf_sg() ->
+    smr_apistr58("sg").
+
+
+-spec smr_apistr58(Prefix) -> string_matcher()
+    when Prefix :: string().
+% @private
+% string matcher for
+%
+%   ak_...
+%   ct_...
+%   sg_...
+%
+% prefix is given as arg
+%
+% ... are base58 chars
+% @end
+smr_apistr58(Prefix) ->
+    smr_seq([smr_string(Prefix), smr_char($_), smr_plus(smr_base58char())]).
+
+
+smr_base58char() ->
+    smr_oneofchars("123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz").
+
+
+
+
+
+
+%---------------------------------------------------------
+% API: string matcher primitive constructors
+%---------------------------------------------------------
+
+-spec smr_char(Char) -> string_matcher()
+    when Char :: integer().
+% @doc
+% string matcher for a specific char
+%
+%   /[abc]/ <~> smr_union([smr_char($a), smr_char($b), smr_char($c)])
+%
+% @end
+smr_char(X) when is_integer(X) ->
+    {smr_char, X}.
+
+
+
+-spec smr_char_range(LowerBound, UpperBound) -> string_matcher()
+    when LowerBound :: integer(),
+         UpperBound :: integer().
+% @doc
+% string matcher for a range of characters
+%
+%   /[a-z]/ <~> smr_char_range($a, $z)
+%   /[0-9]/ <~> smr_char_range($0, $9)
+% @end
+smr_char_range(X, Y) when is_integer(X), is_integer(Y) ->
+    {smr_char_range, X, Y}.
+
+
+
+-spec smr_union(StringMatchers) -> string_matcher()
+    when StringMatchers :: [StringMatcher],
+         StringMatcher  :: string_matcher().
+% @doc
+% String matcher that matches on the first matcher given that matches
+%
+%   /[abc]/     <~> smr_union([smr_char($a), smr_char($b), smr_char($c)])
+%   /(foo|bar)/ <~> smr_union([smr_string("foo"), smr_string("bar")])
+% @end
+smr_union(List) when is_list(List) ->
+    {smr_union, List}.
+
+
+
+-spec smr_seq(StringMatchers) -> string_matcher()
+    when StringMatchers :: [string_matcher()].
+% @doc
+% Match a sequence of matchers
+%
+%   /abc/ <~> smr_seq([smr_char($a), smr_char($b), smr_char($c)])
+%
+% smr_string/1 just maps to a sequence of chars
+% @end
+smr_seq(List) when is_list(List) ->
+    {smr_seq, List}.
+
+
+
+-spec smr_plus(Matcher) -> string_matcher()
+    when Matcher :: string_matcher().
+% @doc
+% "one or more of"; like the `+` operator in regexes.
+%
+%   sm_plus(SMR, Src0) ->
+%       case string_match(SMR, Src0) of
+%           {strmatch, Str, Src1} -> sm_star(SMR, Str, Src1);
+%           no_strmatch               -> no_strmatch
+%       end.
+% @end
+smr_plus(SMR) ->
+    {smr_plus, SMR}.
+
+
+
+-spec smr_star(Matcher) -> string_matcher()
+    when Matcher :: string_matcher().
+% @doc
+% "zero or more of"; like the `*` operator in regexes.
+%
+%   sm_star(SMR, Acc, Src0) ->
+%       case string_match(SMR, Src0) of
+%           % 0
+%           no_strmatch ->
+%               {strmatch, unicode:characters_to_list(Acc), Src0};
+%           % or more
+%           {strmatch, Str, Src1} ->
+%               sm_star(SMR, [Acc, Str], Src1)
+%       end.
+% @end
+smr_star(SMR) ->
+    {smr_star, SMR}.
+
+
+
+-spec smr_dot() -> string_matcher().
+% @doc
+% matches every character; analogous to /./
+%
+% string_match(smr_dot, SrcStr) ->
+%     case SrcStr of
+%         [C | Rest] -> {strmatch, [C], Rest};
+%         []         -> no_strmatch
+%     end;
+% @end
+smr_dot() ->
+    smr_dot.
+
+
+
+-spec smr_ncmatch(MustNotMatch, Match) -> string_matcher()
+    when MustNotMatch :: string_matcher(),
+         Match        :: string_matcher().
+% @doc
+% Negative conditional match; analogous to `[^abc]` but more flexible
+%
+%
+%   /[^abc]/ <-> smr_ncmatch(smr_union([smr_char($a), smr_char($b), smr_char($c)]),
+%                            smr_dot()).
+%
+%
+%   string_match({smr_ncmatch, MustNotMatch, Match}, SrcStr) ->
+%       case string_match(MustNotMatch, SrcStr) of
+%           no_strmatch -> string_match(Match, SrcStr);
+%           _       -> no_strmatch
+%       end.
+%
+% @end
+smr_ncmatch(A, B) ->
+    {smr_ncmatch, A, B}.
+
+
+
+%---------------------------------------------------------
+% string matcher helpers
+%---------------------------------------------------------
+
+-spec smr_string(Chars) -> string_matcher()
+    when Chars :: string().
+% @doc
+% matches chars given in sequence; basically like putting the string in raw in
+% a regex
+%
+%   /foo/ <~> smr_string("foo")
+%         <~> smr_seq([smr_char($f), smr_char($o), smr_char($o)])
+%
+% rewrite over smr_seq/1 and smr_char/1
+%
+%   smr_string(String) when is_list(String) ->
+%       smr_seq([smr_char(C) || C <- String]).
+% @end
+smr_string(String) when is_list(String) ->
+    smr_seq([smr_char(C) || C <- String]).
+
+
+
+-spec smr_oneofchars(Chars) -> UnionMatcher
+    when Chars        :: string(),
+         UnionMatcher :: string_matcher().
+% @doc
+% String matcher for one of chars
+%
+%  /[abc]/ <~> smr_costring("abc")
+%          <~> smr_union([smr_char($f), smr_char($o), smr_char($o)])
+%
+% this is the dual of smr_string/1. string puts chars in sequence, this puts
+% chars in parallel.
+%
+% "costring" nomenclature is chosen specifically to annoy craig
+%
+% if you fix your stupid url schema i will consider changing this name
+%
+% the thing is though this is actually a good name, your url schema is just...
+% well you know it's compact, so you have amazon beat. no page-long urls for
+% gajumarket
+%
+% you know what, we're keeping both names
+%
+% i'm confusing myself, renaming to "oneofchars"
+% @end
+smr_oneofchars(Chars) ->
+    smr_union([smr_char(C) || C <- Chars]).
+
+
+%%=======================================================================
+%% INTERNALS: string matching logic
+%%=======================================================================
+
+
+-spec string_match(Matcher, Source) -> MaybeMatch
+    when Matcher    :: string_matcher(),
+         Source     :: string(),
+         MaybeMatch :: {strmatch, Matched :: string(), Rest :: string()}
+                     | no_strmatch.
+% @private
+% See if the source matches the given matcher; returns
+%
+%   %% NOTIONAL code
+%   string_match(/[abc]/, "abc") ->
+%       {strmatch, "a", "bc"}
+%   string_match(/[abc]/, "def") ->
+%       no_strmatch
+% @end
+string_match({smr_char, C}, SrcStr) ->
+    case SrcStr of
+        [X | Rest] when X =:= C -> {strmatch, [C], Rest};
+        _                       -> no_strmatch
+    end;
+string_match({smr_char_range, X, Y}, Src0) ->
+    case Src0 of
+        [C | Src1] when X =< C, C =< Y -> {strmatch, [C], Src1};
+        _                              -> no_strmatch
+    end;
+string_match({smr_union, SMRs}, Src0) ->
+    sm_union(SMRs, Src0);
+string_match({smr_seq, SMRs}, Src0) ->
+    sm_seq(SMRs, [], Src0);
+string_match({smr_plus, SMR}, Src0) ->
+    sm_plus(SMR, Src0);
+string_match({smr_star, SMR}, Src0) ->
+    sm_star(SMR, [], Src0);
+string_match(smr_dot, SrcStr) ->
+    case SrcStr of
+        [C | Rest] -> {strmatch, [C], Rest};
+        []         -> no_strmatch
+    end;
+string_match({smr_ncmatch, MustNotMatch, Match}, SrcStr) ->
+    case string_match(MustNotMatch, SrcStr) of
+        no_strmatch -> string_match(Match, SrcStr);
+        _           -> no_strmatch
+    end.
+
+
+% @private union must match *one* thing
+sm_union([SMR | SMRs], Src0) ->
+    case string_match(SMR, Src0) of
+        no_strmatch -> sm_union(SMRs, Src0);
+        Match       -> Match
+    end;
+sm_union([], _) ->
+    no_strmatch.
+
+
+% @private sequence must match *EACH* thing
+sm_seq([SMR | SMRs], Acc, Src0) ->
+    case string_match(SMR, Src0) of
+        {strmatch, Str, Src1} -> sm_seq(SMRs, [Acc, Str], Src1);
+        no_strmatch           -> no_strmatch
+    end;
+sm_seq([], Acc, Src) ->
+    {strmatch, unicode:characters_to_list(Acc), Src}.
+
+
+% @private plus matches at least one
+sm_plus(SMR, Src0) ->
+    case string_match(SMR, Src0) of
+        {strmatch, Str, Src1} -> sm_star(SMR, Str, Src1);
+        no_strmatch           -> no_strmatch
+    end.
+
+
+% @private star matches 0 or more
+sm_star(SMR, Acc, Src0) ->
+    case string_match(SMR, Src0) of
+        % 0
+        no_strmatch ->
+            {strmatch, unicode:characters_to_list(Acc), Src0};
+        % or more
+        {strmatch, Str, Src1} ->
+            sm_star(SMR, [Acc, Str], Src1)
+    end.
+
@@ -0,0 +1,790 @@
+% @doc
+% Ref: so_scan.erl
+%
+% This file contains a sophia tokenizer written in straightforward erlang with data
+% types that are sane.
+%
+% For MVP it mimics the behavior of so_scan exactly, in terms of like what its
+% definition of a token is and so on.
+%
+% sfc_so_scan.erl contains a compatibility layer that should agree with so_scan
+% exactly. It converts the data types here to the shapes that so_scan outputs.
+%
+% This is for two reasons:
+%
+% 1. in order to enable testing the two modules against each other, and
+% 2. to future-proof in case we decide to incrementally incorporate the sfc
+%    code into the legacy sophia compiler
+% @end
+-module(sfc_tokens).
+
+% meta
+-export([
+    token_types_parse_order/0,
+    kwds/0
+]).
+
+-export([
+    take_while/2,
+    take_while/3,
+    take_block/1,
+    take_block_item/1,
+    strings/2,
+    slurp_plist/1
+]).
+
+% token slurping
+-export([
+    indent_level/1,
+    is_significant/1,
+    filter_significant/1,
+    significant_tokens/1,
+    tokens/1,
+    slurp_token/2,
+    slurp_token_types/3,
+    slurp_token_of_type/3,
+    new_pos/2
+]).
+
+-include("$sfc_include/sfc.hrl").
+
+
+%=======================================================
+% API: functions
+%=======================================================
+
+-spec strings(N, Tokens) -> AtMostNStrings
+    when N              :: non_neg_integer(),
+         Tokens         :: [sfc_token()],
+         AtMostNStrings :: [string()].
+% @doc return the strings of the first N tokens
+
+strings(N, [#sfc_token{string = S} | Rest]) when is_integer(N), N >= 1 ->
+    [S | strings(N-1, Rest)];
+strings(_, []) ->
+    [];
+strings(0, _) ->
+    [].
+
+
+% used by parser
+%
+% a block is a column-delimited list of block items
+%
+% BLOCK =
+%   foo
+%       ...
+%   bar
+%       ...
+%   baz
+%       ...
+%
+% BLOCK_ITEM =
+%   foo
+%       ...
+
+-spec take_block(Tokens) -> {BlockTokens, Rest}
+    when Tokens :: [sfc_token()],
+         BlockTokens :: Tokens,
+         Rest :: Tokens.
+% @doc
+% takes all tokens whose column position is >= the column position of
+% the head token
+
+take_block([H = #sfc_token{pos = {_, BlkCol}} | T]) ->
+    TokenInBlock =
+        fun(#sfc_token{pos = {_, TkCol}}) ->
+            BlkCol =< TkCol
+        end,
+    take_while(TokenInBlock, [H], T);
+take_block([]) ->
+    {[], []}.
+
+
+
+-spec take_block_item(Tokens) -> {ItemTokens, Rest}
+    when Tokens :: [sfc_token()],
+         ItemTokens :: Tokens,
+         Rest :: Tokens.
+% @doc
+% takes all tokens whose column position is > the column position of
+% the head token
+
+take_block_item([H = #sfc_token{pos = {_, ItemCol}} | T]) ->
+    TokenInItem =
+        fun(#sfc_token{pos = {_, TkCol}}) ->
+            ItemCol < TkCol
+        end,
+    take_while(TokenInItem, [H], T);
+take_block_item([]) ->
+    {[], []}.
+
+
+
+-spec slurp_plist(Tokens) -> Result
+    when Tokens    :: [Token],
+         Result    :: {slurp, PList :: Tokens, After :: Tokens}
+                    | {error, Mismatch},
+         Mismatch  :: {fixme, mismatch, OpenStack, ClosedBy},
+         OpenStack :: Tokens,
+         ClosedBy  :: none | {value, Token},
+         Token     :: sfc_token().
+
+% @doc
+% the verbiage here is `slurp' rather than `take' because we insist on
+% delimiter matching.
+%
+% typical happy path:
+%   "(foo, bar) baz" ~> {slurp, "(foo, bar)", "baz"}
+%   "() baz"         ~> {slurp, "()",         "baz"}
+%   "foo () baz"     ~> {slurp, "",           "foo () baz"}
+%   "(foo, bar) baz" ~> {slurp, "(foo, bar)", "baz"}
+%
+% typical sad path:
+%   "(foo, bar]"  ~> {mismatch, ["("],      {value, "]"}}
+%   "(foo, bar"   ~> {mismatch, ["("],      none}
+%   "([foo, bar)" ~> {mismatch, ["[", "("], {value, ")"}}
+%
+% counterintuitive:
+%   "[foo, bar) baz"     ~> {slurp, "", "[foo, bar) baz"}
+%   "~!!\inv4l1d syntax" ~> {slurp, "", "~!!\inv4l1d syntax"}
+%   "(foo, bar)(baz)"    ~> {slurp, "(foo bar)", "(baz)"}
+%
+% the only "syntax checking" occurring is making sure the delimiter
+% stack pushes and pops properly
+%
+% please note that on mismatch, the list of open delimiters is
+% returned in STACK order, meaning the most recent open delimiters
+% first. this is more convenient for programs, but might be
+% counterintuitive to end-users (who are programmers, entirely
+% unfamiliar with notions like stacks and open/close delimiters)
+
+slurp_plist([Hd = #sfc_token{string = "("} | Tl]) ->
+    slurp_dlist([Hd], [Hd], Tl);
+slurp_plist(Tks) ->
+    {slurp, [], Tks}.
+
+
+% happy terminal case: stack popped entirely
+slurp_dlist(All, [], NewTokens) ->
+    {slurp, lists:reverse(All), NewTokens};
+% WMA stack is nonempty
+% happy cases of opens getting popped
+slurp_dlist(All, [#sfc_token{string = "("}      | NewOpen],
+                 [#sfc_token{string = ")"} = Tk | NewTks]) ->
+    slurp_dlist([Tk | All], NewOpen, NewTks);
+slurp_dlist(All, [#sfc_token{string = "["}      | NewOpen],
+                 [#sfc_token{string = "]"} = Tk | NewTks]) ->
+    slurp_dlist([Tk | All], NewOpen, NewTks);
+slurp_dlist(All, [#sfc_token{string = "{"}      | NewOpen],
+                 [#sfc_token{string = "}"} = Tk | NewTks]) ->
+    slurp_dlist([Tk | All], NewOpen, NewTks);
+% happy: open delimiters getting pushed
+slurp_dlist(All, Opens, [#sfc_token{string = "("} = Tk | NewTks]) ->
+    slurp_dlist([Tk | All], [Tk | Opens], NewTks);
+slurp_dlist(All, Opens, [#sfc_token{string = "["} = Tk | NewTks]) ->
+    slurp_dlist([Tk | All], [Tk | Opens], NewTks);
+slurp_dlist(All, Opens, [#sfc_token{string = "{"} = Tk | NewTks]) ->
+    slurp_dlist([Tk | All], [Tk | Opens], NewTks);
+% sad: mismatch cases
+slurp_dlist(All, Opens, []) ->
+    {error, {fixme, mismatch, Opens, none}};
+slurp_dlist(All, Opens, [#sfc_token{string = "}"} = BadClose | _]) ->
+    {error, {fixme, mismatch, Opens, {value, BadClose}}};
+slurp_dlist(All, Opens, [#sfc_token{string = "]"} = BadClose | _]) ->
+    {error, {fixme, mismatch, Opens, {value, BadClose}}};
+slurp_dlist(All, Opens, [#sfc_token{string = ")"} = BadClose | _]) ->
+    {error, {fixme, mismatch, Opens, {value, BadClose}}};
+% general case: non-terminal token gets pushed
+slurp_dlist(All, Opens, [Tk | NewTks]) ->
+    slurp_dlist([Tk | All], Opens, NewTks).
+
+
+%-------------------------------------------------------
+% API: meta info
+%
+% This is parse order definition, list of keywords, etc
+%
+% -export([
+%     token_types_parse_order/0,
+%     kwds/0
+% ]).
+%-------------------------------------------------------
+
+-spec token_types_parse_order() -> [sfc_token_type()].
+% @doc
+% list of sophia tokens in parse order (if an earlier type matches, the later
+% type isn't even checked)
+%
+%
+%    Rules =
+%          %% Comments and whitespace
+%        [ CommentStart
+%        , {"//.*", skip()}
+%        , {WS,     skip()}
+%
+%          %% Special characters
+%        , {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
+%
+%          %% Literals
+%        , {CHAR,   token(char,   fun parse_char/1)}
+%        , {STRING, token(string, fun parse_string/1)}
+%        , {HEX,    token(hex,    fun parse_hex/1)}
+%        , {INT,    token(int,    fun parse_int/1)}
+%        , {BYTES,  token(bytes,  fun parse_bytes/1)}
+%
+%          %% Identifiers (qualified first!)
+%        , {QID,   token(qid,  fun(S) -> string:tokens(S, ".") end)}
+%        , {QCON,  token(qcon, fun(S) -> string:tokens(S, ".") end)}
+%        , {TVAR,  token(tvar)}
+%        , override({ID, token(id)}, {KW, symbol()})    %% Keywords override identifiers. Need to
+%        , {CON, token(con)}                            %% use override to avoid lexing "lettuce"
+%                                                       %% as ['let', {id, "tuce"}].
+%          %% Operators
+%        , {OP, symbol()}
+%        ],
+% @end
+
+token_types_parse_order() ->
+    % written in this style to be maximally editable
+    lists:flatten([
+        % comments and whitespace
+        lcom, bcom, ws,
+        punct,
+        % literals
+        char, string, int16, int10, bytes,
+        ak, ct, sg,
+        % qualified names need to go ahead of unqualifieds
+        qid, qcon,
+        tvar,
+        % keywords need to be parsed ahead of ids
+        kwd, id,
+        con,
+        % ops [=, =>, >>], punctuation (parens/braces)
+        op
+    ]).
+
+
+-spec kwds() -> list(string()).
+% @doc list of sophia kwds
+
+kwds() ->
+    ["contract", "include", "let", "switch", "type", "record", "datatype",
+     "if", "elif", "else", "function", "stateful", "payable", "true", "false",
+     "mod", "public", "entrypoint", "private", "indexed", "namespace",
+     "interface", "main", "using", "as", "for", "hiding", "band", "bor",
+     "bxor", "bnot"].
+
+
+%-------------------------------------------------------
+% API: token slurping
+%
+% -export([
+%     tokens/1,
+%     slurp_token/1,
+%     slurp_token_types/2,
+%     slurp_token_of_type/2
+% ]).
+%-------------------------------------------------------
+
+% Token accessors
+-spec indent_level(sfc_token()) -> pos_integer().
+
+indent_level(#sfc_token{pos = {_, IndentLevel}}) ->
+    IndentLevel.
+
+
+
+-spec significant_tokens(SrcStr) -> Result
+    when SrcStr :: iolist(),
+         Result :: {ok, Tokens}
+                 | {error, sfc_err()},
+         Tokens :: [sfc_token()].
+
+significant_tokens(SrcStr) ->
+    case tokens(SrcStr) of
+        {ok, Tokens} ->
+            {ok, filter_significant(Tokens)};
+        Error ->
+            Error
+    end.
+
+
+
+-spec filter_significant(Tokens) -> SignificantTokens
+    when Tokens            :: [sfc_token()],
+         SignificantTokens :: Tokens.
+
+filter_significant(Tokens) ->
+    lists:filter(fun is_significant/1, Tokens).
+
+
+
+-spec is_significant(Token) -> boolean()
+    when Token :: sfc_token().
+
+is_significant(#sfc_token{type = bcom}) -> false;
+is_significant(#sfc_token{type = lcom}) -> false;
+is_significant(#sfc_token{type = ws})   -> false;
+is_significant(_)                           -> true.
+
+
+-spec tokens(SrcStr) -> Result
+    when SrcStr :: iolist(),
+         Result :: {ok, Tokens}
+                 | {error, sfc_err()},
+         Tokens :: [sfc_token()].
+% @doc
+% Recursively parse all tokens off the front end of the string. `Rest' is
+% the first tail of the string for which no token parser succeeds.
+%
+% Semantically, `Rest'` being nonempty amounts to the presence of an illegal
+% character.
+
+tokens(S) ->
+    % defensive normalization
+    tokens([], {1, 1}, unicode:characters_to_nfc_list(S)).
+
+tokens(Stack, _FinalPos, "") ->
+    {ok, lists:reverse(Stack)};
+tokens(Stack, Pos, SrcStr) ->
+    case slurp_token(Pos, SrcStr) of
+        {tokmatch, NewToken = #sfc_token{string = TokStr},
+                   NewSrcStr} ->
+            NewPos = new_pos(Pos, TokStr),
+            tokens([NewToken | Stack], NewPos, NewSrcStr);
+        no_tokmatch ->
+            PrevTokens = lists:reverse(Stack),
+            Err = #sfc_err_no_tokmatch{prev_tokens = PrevTokens,
+                                       break_pos   = Pos,
+                                       rest        = SrcStr},
+            {error, Err};
+        % FIXME so_scan bad
+        % this is so fucking stupid
+        % so_scan for some reason allows unterminated block comments at
+        % the end of files
+        %
+        % for now we're just going to agree with so_scan
+        {ierr, unterminated_block_comment} ->
+            PrevTokens = lists:reverse(Stack),
+            Err = #sfc_err_bcom_unterminated{prev_tokens = PrevTokens,
+                                             break_pos   = Pos,
+                                             rest        = SrcStr},
+            {error, Err};
+        Error = {error, _} ->
+            Error
+    end.
+
+% alright some bullshit here
+%
+% we're computing the line/column position of each string
+%
+% however this is meant to be compatible with so_scan, so it's a bit wonky
+% because regex list bullshit.
+%
+% recall that so_scan operates on the list representation of the utf-8 encoded
+% bytes; this is different than on a list of bignum codepoints (e.g.
+% unicode:characters_to_nfc_list(Bytes)); let's suppose some stupid complicated
+% foreign character which a sane language would simply criminalize has list
+% representation [ABC], but byte representation <<A,B,C>>
+%
+% as far as so_scan is concerned, this means the character ABC consumes 3
+% columns. the only exception is tab characters, which always fast-forward to
+% the next tab stop, which is 1-indexed because god hates all of us
+%
+% so the tab-stops are
+%   1 9 17 25 33 ...
+%
+% column position is determined in all cases by byte order, EXCEPT for $\t
+% which goes to the next tab stop
+%
+% so in general, for the token string, we need to convert to bytes first,
+% then handle `\t` bytes as a special case
+%
+% again in the tokenizer context, we're assuming that the input to our
+% tokenizer is an nfc-list which has a flat list of each unicode character in
+% codepoint form
+%
+% here we're just converting it to byte form, then computing columns based on
+% bytes
+new_pos(OldPos, TokStr) ->
+    new_pos_bytes(unicode:characters_to_binary(TokStr), OldPos).
+
+% newline just goes to {L+1, 1}
+new_pos_bytes(<<$\n:8, Rest/bytes>>, _Pos = {L, _}) ->
+    NewPos = {L+1, 1},
+    new_pos_bytes(Rest, NewPos);
+new_pos_bytes(<<$\t:8, Rest/bytes>>, _Pos = {Linum, Colnum1}) ->
+    % stinky wet sweaty robots need 1-based indexing
+    % so tab stops are at
+    %   1 9 17 25
+    % super awesome dry silicon robots use 0-based indexing
+    % so tab stops are at
+    %   0 8 16 25
+    Colnum0 = Colnum1 - 1,
+    % 0 based is based
+    NextTabstop0 = next_tabstop8(Colnum0),
+    NextTabstop1 = NextTabstop0 + 1,
+    NextPos      = {Linum, NextTabstop1},
+    new_pos_bytes(Rest, NextPos);
+new_pos_bytes(<<_:8, Rest/bytes>>, _Pos = {Linum, Colnum1}) ->
+    % in general advance by 1
+    new_pos_bytes(Rest, {Linum, Colnum1 + 1});
+new_pos_bytes(<<>>, FinalPos) ->
+    FinalPos.
+
+% 0     8       16      24      etc
+% 0*8   1*8     2*8     3*8     etc
+next_tabstop8(Col0) when Col0 >= 0 ->
+    % Col0 = PrevTabQ*8 + PrevTabR
+    PrevTabQ = Col0 div 8,
+    PrevTabR = Col0 rem 8,
+    Col0 = PrevTabQ*8 + PrevTabR,
+    NextTabQ = PrevTabQ + 1,
+    NextTabCol0 = NextTabQ*8,
+    NextTabCol0.
+
+%% copied from so_scan_lib.erl just to match behavior
+%-define(TAB_SIZE, 8).
+%
+%next_pos([], P) -> P;
+%next_pos([$\n | S], {L, _}) -> next_pos(S, {L + 1, 1});
+%next_pos([$\t | S], {L, C}) -> next_pos(S, {L, (C + ?TAB_SIZE - 1) div ?TAB_SIZE * ?TAB_SIZE + 1});
+%next_pos([_   | S], {L, C}) -> next_pos(S, {L, C + 1}).
+
+
+
+-spec slurp_token(Pos, SrcStr) -> Result
+    when Pos    :: sfc_pos(),
+         SrcStr :: string(),
+         Result :: {tokmatch, Token, Rest}
+                 | no_tokmatch
+                 | {error, sfc_err()}
+                 | {ierr, unterminated_block_comment},
+         Token  :: sfc_token(),
+         Rest   :: string().
+% @doc
+% grab a single token off the front of the string according to
+% `token_types_parse_order/0'
+
+slurp_token(Pos, SrcStr) ->
+    % this is the easiest format if i need to fuck with it
+    slurp_token_types(token_types_parse_order(), Pos, SrcStr).
+
+
+
+-spec slurp_token_types(ParseOrder, Pos, SrcStr) -> Result
+    when ParseOrder :: [sfc_token_type()],
+         Pos        :: sfc_pos(),
+         SrcStr     :: string(),
+         Result     :: {tokmatch, Token, Rest}
+                     | no_tokmatch
+                     | {error, sfc_err()}
+                     | {ierr, unterminated_block_comment},
+         Token      :: sfc_token(),
+         Rest       :: string().
+% @doc
+% grab a single token off the front of the string according to
+% `token_types_parse_order/0'
+
+slurp_token_types([TokenType | TTs], Pos, SrcStr) ->
+    case slurp_token_of_type(TokenType, Pos, SrcStr) of
+        Match = {tokmatch, _, _} -> Match;
+        no_tokmatch              -> slurp_token_types(TTs, Pos, SrcStr);
+        IErr  = {ierr, _}        -> IErr;
+        Error = {error, _}       -> Error
+    end;
+slurp_token_types([], _Pos, _SrcStr) ->
+    no_tokmatch.
+
+
+-spec slurp_token_of_type(TokenType, Pos, SrcStr) -> MaybeToken
+    when TokenType  :: sfc_token_type(),
+         Pos        :: sfc_pos(),
+         SrcStr     :: string(),
+         MaybeToken :: {tokmatch, Token, Rest}
+                     | no_tokmatch
+                     | {error, sfc_err()}
+                     | {ierr, unterminated_block_comment},
+         Token      :: sfc_token(),
+         Rest       :: string().
+% @doc
+% match a sophia token of a given type off the front of the string
+% @end
+
+% COMMENTS AND WHITESPACE: lcom, bcom, ws
+%
+% sophia line comment
+%
+% i am not going to bother writing a string matcher thing for this
+% FIXME: make a string matcher for line comments
+slurp_token_of_type(lcom, Pos, SrcStr) ->
+    case SrcStr of
+        "//" ++ _ ->
+            {Line, Rest} = takeline("", SrcStr),
+            Token = #sfc_token{type   = lcom,
+                                   pos    = Pos,
+                                   string = Line},
+            {tokmatch, Token, Rest};
+        _ ->
+            no_tokmatch
+    end;
+% Block comments cannot have a string matcher because they have a whole stack
+% thing keeping track of depth because of nested block comments
+slurp_token_of_type(bcom, Pos, SrcStr0) ->
+    case SrcStr0 of
+        "/*" ++ SrcStr1 ->
+            case bcom("/*", 1, SrcStr1) of
+                {ok, CommentStr, SrcStr2} ->
+                    Token = #sfc_token{type   = bcom,
+                                           pos    = Pos,
+                                           string = CommentStr},
+                    {tokmatch, Token, SrcStr2};
+                Error ->
+                    Error
+            end;
+        _ ->
+            no_tokmatch
+    end;
+slurp_token_of_type(ws, Pos, SrcStr) ->
+    WhitespaceMatcher = sfc_strmatch:smr_sf_ws(),
+    case sfc_strmatch:match(WhitespaceMatcher, SrcStr) of
+        no_strmatch ->
+            no_tokmatch;
+        {strmatch, WS, Rest} ->
+            Token = #sfc_token{type   = ws,
+                                   pos    = Pos,
+                                   string = WS},
+            {tokmatch, Token, Rest}
+    end;
+% KEYWORDS, OPERATORS, PUNCTUATION: kwd, op, punct
+%
+% all the kwds are valid ids, so we match as an id and then check if it's a
+% kwd
+%
+% kwds are allowed to be prefixes for user-defined variable names; e.g.
+% "lettuce" should be parsed as an id, not as ["let", "tuce"]; for this reason
+% we need to be careful with greedily parsing kwds
+%
+% we know kwds are always ids, so we parse it as an id and see if it's one
+% of the kwds
+slurp_token_of_type(kwd, Pos, SrcStr) ->
+    case slurp_token_of_type(id, Pos, SrcStr) of
+        {tokmatch, IdTok = #sfc_token{string = IdStr}, Rest} ->
+            case lists:member(IdStr, kwds()) of
+                false ->
+                    no_tokmatch;
+                true ->
+                    KwTok = IdTok#sfc_token{type = kwd},
+                    {tokmatch, KwTok, Rest}
+            end;
+        no_tokmatch ->
+            no_tokmatch
+    end;
+slurp_token_of_type(op, Pos, SrcStr) ->
+    case sfc_strmatch:match(sfc_strmatch:smr_sf_op(), SrcStr) of
+        {strmatch, Str, Rest} ->
+            Token = #sfc_token{type = op, pos = Pos, string = Str},
+            {tokmatch, Token, Rest};
+        no_strmatch ->
+            no_tokmatch
+    end;
+slurp_token_of_type(punct, Pos, SrcStr) ->
+    case sfc_strmatch:match(sfc_strmatch:smr_sf_punct(), SrcStr) of
+        {strmatch, Str, Rest} ->
+            Token = #sfc_token{type = punct, pos = Pos, string = Str},
+            {tokmatch, Token, Rest};
+        no_strmatch ->
+            no_tokmatch
+    end;
+% SOPHIA VARIABLE NAMES: id, con, qid, qcon, tvar
+slurp_token_of_type(id, Pos, SrcStr) ->
+    case sfc_strmatch:match(sfc_strmatch:smr_sf_id(), SrcStr) of
+        {strmatch, IdStr, Rest} ->
+            Token = #sfc_token{type = id, pos = Pos, string = IdStr},
+            {tokmatch, Token, Rest};
+        no_strmatch ->
+            no_tokmatch
+    end;
+slurp_token_of_type(con, Pos, SrcStr) ->
+    case sfc_strmatch:match(sfc_strmatch:smr_sf_con(), SrcStr) of
+        {strmatch, Str, Rest} ->
+            Token = #sfc_token{type = con, pos = Pos, string = Str},
+            {tokmatch, Token, Rest};
+        no_strmatch ->
+            no_tokmatch
+    end;
+slurp_token_of_type(qid, Pos, SrcStr) ->
+    case sfc_strmatch:match(sfc_strmatch:smr_sf_qid(), SrcStr) of
+        {strmatch, Str, Rest} ->
+            Token = #sfc_token{type = qid, pos = Pos, string = Str},
+            {tokmatch, Token, Rest};
+        no_strmatch ->
+            no_tokmatch
+    end;
+slurp_token_of_type(qcon, Pos, SrcStr) ->
+    case sfc_strmatch:match(sfc_strmatch:smr_sf_qcon(), SrcStr) of
+        {strmatch, Str, Rest} ->
+            Token = #sfc_token{type = qcon, pos = Pos, string = Str},
+            {tokmatch, Token, Rest};
+        no_strmatch ->
+            no_tokmatch
+    end;
+slurp_token_of_type(tvar, Pos, SrcStr) ->
+    case sfc_strmatch:match(sfc_strmatch:smr_sf_tvar(), SrcStr) of
+        {strmatch, Str, Rest} ->
+            Token = #sfc_token{type = tvar, pos = Pos, string = Str},
+            {tokmatch, Token, Rest};
+        no_strmatch ->
+            no_tokmatch
+    end;
+slurp_token_of_type(int16, Pos, SrcStr) ->
+    case sfc_strmatch:match(sfc_strmatch:smr_sf_int16(), SrcStr) of
+        {strmatch, Str, Rest} ->
+            Token = #sfc_token{type = int16, pos = Pos, string = Str},
+            {tokmatch, Token, Rest};
+        no_strmatch ->
+            no_tokmatch
+    end;
+slurp_token_of_type(int10, Pos, SrcStr) ->
+    case sfc_strmatch:match(sfc_strmatch:smr_sf_int10(), SrcStr) of
+        {strmatch, Str, Rest} ->
+            Token = #sfc_token{type = int10, pos = Pos, string = Str},
+            {tokmatch, Token, Rest};
+        no_strmatch ->
+            no_tokmatch
+    end;
+% LITERAL PARSERS: char, string, hex, int, bytes10, bytes16,
+%                  ak, ct, sg
+%
+% char: sophia char literal
+slurp_token_of_type(ak, Pos, SrcStr) ->
+    StringMatcher = sfc_strmatch:smr_sf_ak(),
+    case sfc_strmatch:match(StringMatcher, SrcStr) of
+        no_strmatch ->
+            no_tokmatch;
+        {strmatch, TokenStr, Rest} ->
+            Token = #sfc_token{type = ak, pos = Pos, string = TokenStr},
+            {tokmatch, Token, Rest}
+    end;
+slurp_token_of_type(ct, Pos, SrcStr) ->
+    StringMatcher = sfc_strmatch:smr_sf_ct(),
+    case sfc_strmatch:match(StringMatcher, SrcStr) of
+        no_strmatch ->
+            no_tokmatch;
+        {strmatch, TokenStr, Rest} ->
+            Token = #sfc_token{type = ct, pos = Pos, string = TokenStr},
+            {tokmatch, Token, Rest}
+    end;
+slurp_token_of_type(sg, Pos, SrcStr) ->
+    StringMatcher = sfc_strmatch:smr_sf_sg(),
+    case sfc_strmatch:match(StringMatcher, SrcStr) of
+        no_strmatch ->
+            no_tokmatch;
+        {strmatch, TokenStr, Rest} ->
+            Token = #sfc_token{type = sg, pos = Pos, string = TokenStr},
+            {tokmatch, Token, Rest}
+    end;
+slurp_token_of_type(char, Pos, SrcStr) ->
+    StringMatcher = sfc_strmatch:smr_sf_char(),
+    case sfc_strmatch:match(StringMatcher, SrcStr) of
+        no_strmatch ->
+            no_tokmatch;
+        {strmatch, TokenStr, Rest} ->
+            Token = #sfc_token{type = char, pos = Pos, string = TokenStr},
+            {tokmatch, Token, Rest}
+    end;
+slurp_token_of_type(string, Pos, SrcStr) ->
+    case sfc_strmatch:match(sfc_strmatch:smr_sf_str(), SrcStr) of
+        no_strmatch ->
+            no_tokmatch;
+        {strmatch, TokenStr, Rest} ->
+            Token = #sfc_token{type = string, pos = Pos, string = TokenStr},
+            {tokmatch, Token, Rest}
+    end;
+slurp_token_of_type(bytes, Pos, SrcStr) ->
+    case sfc_strmatch:match(sfc_strmatch:smr_sf_bytes(), SrcStr) of
+        no_strmatch ->
+            no_tokmatch;
+        {strmatch, TokenStr, Rest} ->
+            Token = #sfc_token{type = bytes, pos = Pos, string = TokenStr},
+            {tokmatch, Token, Rest}
+    end;
+slurp_token_of_type(NyiType, Pos, SrcStr) ->
+    Message = io_lib:format("cannot slurp token of type: ~p", [NyiType]),
+    error(#sfc_err{atom   = nyi,
+                   string = Message,
+                   extra  = [{token_type, NyiType},
+                             {pos, Pos},
+                             {rest, SrcStr}]}).
+
+
+
+takeline(Acc, "")               -> {lists:reverse(Acc), ""};
+takeline(Acc, Rest = "\n" ++ _) -> {lists:reverse(Acc), Rest};
+takeline(Acc, [C | Rest])       -> takeline([C | Acc], Rest).
+
+
+bcom(CommentStr, Depth, SrcStr0) when Depth > 0 ->
+    case SrcStr0 of
+        % premature end
+        "" ->
+            {ierr, unterminated_block_comment};
+        % decrease depth
+        "*/" ++ SrcStr1 ->
+            NewCommentStr = [CommentStr, "*/"],
+            NewDepth = Depth - 1,
+            bcom(NewCommentStr, NewDepth, SrcStr1);
+        % increase depth
+        "/*" ++ SrcStr1 ->
+            NewCommentStr = [CommentStr, "/*"],
+            NewDepth = Depth + 1,
+            bcom(NewCommentStr, NewDepth, SrcStr1);
+        % same depth, add to list
+        [C | SrcStr1] ->
+            NewCommentStr = [CommentStr, C],
+            bcom(NewCommentStr, Depth, SrcStr1)
+    end;
+bcom(CommentStr, 0, SrcStr) ->
+    {ok, unicode:characters_to_nfc_list(CommentStr), SrcStr}.
+
+
+%------------------------------------------
+% INTERNAL UTILITIES
+%------------------------------------------
+
+-spec take_while(Pred, List) -> {Taken, Rest}
+    when Pred  :: fun((Item) -> boolean()),
+         List  :: [Item],
+         Taken :: List,
+         Rest  :: List.
+% @doc similar to lists:takewhile but returns {Taken, Rest}.  Name is
+% to remind you it returns 2 things.
+
+take_while(Pred, List) ->
+    take_while(Pred, [], List).
+
+
+-spec take_while(Pred, Prefix, List) -> {Taken, Rest}
+    when Pred  :: fun((Item) -> boolean()),
+         Prefix :: List,
+         List   :: [Item],
+         Taken  :: List,
+         Rest   :: List.
+% @doc
+% similar to takewhile_ii/2, but returns {Prefix ++ Taken, Rest}
+%
+% where Prefix
+%
+% middle argument is just the accum
+take_while(Pred, Pfx, List) ->
+    tw3(Pred, lists:reverse(Pfx), List).
+
+
+tw3(Pred, Stk, [X | Xs]) ->
+    case Pred(X) of
+        true  -> tw3(Pred, [X | Stk], Xs);
+        false -> {lists:reverse(Stk), [X | Xs]}
+    end;
+tw3(_, Stk, []) ->
+    {lists:reverse(Stk), []}.
				`@@ -0,0 +1 @@`
				`{"src/*", [debug_info, {i, "include/"}, {outdir, "ebin/"}]}.`