stuff

2026-06-08 19:02:27 -07:00
parent 70643184c3
commit 40f4ce1e87
9 changed files with 572 additions and 344 deletions
@@ -26,7 +26,7 @@ Title                       | Brief Description
 [[Serializations]]          | Conventions for field order in Gajumaru data structures
 [[Smart Contracts]]         | Terminology
 [[Sophia]]                  | Introduction to Sophia, the Gajumaru smart contract language
-[[Sophia FAQ]]              | what it says
+[[Sophia FQA]]              | what it says
 [[State Channels]]          | Overview and characteristics
 [[Testnet Node Setup]]      | Tech support
 [[Transaction]]             | Terminology
@@ -1,259 +0,0 @@
 # Sophia FAQ
 - Created: 2026-03-30
 - Authors: Peter Harpending `<peterharpending@qpq.swiss>`
 - Last Modified: 2026-04-07
 # References
 - [Sophia docs](https://git.qpq.swiss/QPQ-AG/sophia/src/branch/master/docs)
 - [Protocol docs](https://git.qpq.swiss/QPQ-AG/protocol)
 # Defining Events in interfaces
 apparently this is legal syntax but the point of this is unclear.
 # Can there be the same function name with different arities?
 # What happens if you delete a non-existent key from a map?
 # How does sophia compilation work
 From commit `dbab49936daad7d82bae7cf7336b1ce82e7ab779`
 ```erlang
 % so_compiler.erl:84
 -spec file(string()) -> {ok, map()} | {error, [so_errors:error()]}.
 file(Filename) ->
    file(Filename, []).
 -spec file(string(), options()) -> {ok, map()} | {error, [so_errors:error()]}.
 file(File, Options0) ->
    Options = add_include_path(File, Options0),
    case read_contract(File) of
        {ok, Bin} ->
            SrcDir = so_utils:canonical_dir(filename:dirname(File)),
            from_string(Bin, [{src_file, File}, {src_dir, SrcDir} | Options]);
        {error, Error} ->
            Msg = lists:flatten([File,": ",file:format_error(Error)]),
            {error, [so_errors:new(file_error, Msg)]}
    end.
 -spec from_string(binary() | string(), options()) -> {ok, map()} | {error, [so_errors:error()]}.
 from_string(ContractBin, Options) when is_binary(ContractBin) ->
    from_string(binary_to_list(ContractBin), Options);
 from_string(ContractString, Options) ->
    try
        from_string1(ContractString, Options)
    catch
        throw:{error, Errors} -> {error, Errors}
    end.
 from_string1(ContractString, Options) ->
    #{ fcode := FCode
     , fcode_env := FCodeEnv
     , folded_typed_ast := FoldedTypedAst
     , warnings := Warnings } = string_to_code(ContractString, Options),
    #{ child_con_env := ChildContracts } = FCodeEnv,
    SavedFreshNames = maps:get(saved_fresh_names, FCodeEnv, #{}),
    FateCode = so_fcode_to_fate:compile(ChildContracts, FCode, SavedFreshNames, Options),
    pp_assembler(FateCode, Options),
    ByteCode = gmb_fate_code:serialize(FateCode, []),
    {ok, Version} = version(),
    Res = #{byte_code => ByteCode,
            compiler_version => Version,
            contract_source => ContractString,
            type_info => [],
            fate_code => FateCode,
            abi_version => gmb_fate_abi:abi_version(),
            payable => maps:get(payable, FCode),
            warnings => Warnings
           },
    {ok, maybe_generate_aci(Res, FoldedTypedAst, Options)}.
 ```
 So a lot is going on in `string_to_code/2`
 ```erlang
 -spec string_to_code(string(), options()) -> map().
 string_to_code(ContractString, Options) ->
    Ast = parse(ContractString, Options),
    pp_sophia_code(Ast, Options),
    pp_ast(Ast, Options),
    {TypeEnv, FoldedTypedAst, UnfoldedTypedAst, Warnings} = so_ast_infer_types:infer(Ast, [return_env | Options]),
    pp_typed_ast(UnfoldedTypedAst, Options),
    {Env, Fcode} = so_ast_to_fcode:ast_to_fcode(UnfoldedTypedAst, [{original_src, ContractString}|Options]),
    #{ fcode => Fcode
    ,  fcode_env => Env
    ,  unfolded_typed_ast => UnfoldedTypedAst
    ,  folded_typed_ast => FoldedTypedAst
    ,  type_env  => TypeEnv
    ,  ast => Ast
    ,  warnings => Warnings }.
 -spec parse(string(), so_compiler:options()) -> none() | so_syntax:ast().
 parse(Text, Options) ->
    parse(Text, sets:new(), Options).
 -spec parse(string(), sets:set(), so_compiler:options()) -> none() | so_syntax:ast().
 parse(Text, Included, Options) ->
    so_parser:string(Text, Included, Options).
 ```
 So we get an AST from `so_parser:string/3`
 ```
 %% so_parser.erl
 -spec string(string(), sets:set(include_hash()), so_compiler:options()) -> parse_result().
 string(String, Included, Opts) ->
    AST = run_parser(file(), String, Opts),
    case expand_includes(AST, Included, Opts) of
        {ok, AST1}   -> AST1;
        {error, Err} -> parse_error(Err)
    end.
 run_parser(P, Inp) ->
    escape_errors(parse_and_scan(P, Inp, [])).
 run_parser(P, Inp, Opts) ->
    escape_errors(parse_and_scan(P, Inp, Opts)).
 parse_and_scan(P, S, Opts) ->
    set_current_file(proplists:get_value(src_file, Opts, no_file)),
    set_current_dir(proplists:get_value(src_dir, Opts, no_file)),
    set_current_include_type(proplists:get_value(include_type, Opts, none)),
    case so_scan:scan(S) of
        {ok, Tokens} -> so_parse_lib:parse(P, Tokens);
        {error, {{Input, Pos}, _}} ->
            {error, {Pos, scan_error, Input}}
    end.
 ```
 So there's a lot of metadata being kept, but the key part is the call to
 `so_scan:scan/1`
 ```erl
 lexer() ->
    Number   = fun(Digit) -> [Digit, "+(_", Digit, "+)*"] end,
    DIGIT    = "[0-9]",
    HEXDIGIT = "[0-9a-fA-F]",
    LOWER    = "[a-z_]",
    UPPER    = "[A-Z]",
    CON      = [UPPER, "[a-zA-Z0-9_]*"],
    INT      = Number(DIGIT),
    HEX      = ["0x", Number(HEXDIGIT)],
    BYTES    = ["#", Number(HEXDIGIT)],
    WS       = "[\\000-\\ ]+",
    ID       = [LOWER, "[a-zA-Z0-9_']*"],
    TVAR     = ["'", ID],
    QID      = ["(", CON, "\\.)+", ID],
    QCON     = ["(", CON, "\\.)+", CON],
    OP       = "[=!<>+\\-*/:&|?~@^]+",
    %% Five cases for a character
    %%  * 1 7-bit ascii, not \ or '
    %%  * 2-4 8-bit values (UTF8)
    %%  * \ followed by a known modifier [aernrtv]
    %%  * \xhh
    %%  * \x{hhh...}
    CHAR     = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
    STRING   = "\"([^\"\\\\]|(\\\\.))*\"",
    CommentStart = {"/\\*", push(comment, skip())},
    CommentRules =
        [ CommentStart
        , {"\\*/",        pop(skip())}
        , {"[^/*]+|[/*]", skip()} ],
    Keywords = ["contract", "include", "let", "switch", "type", "record", "datatype", "if", "elif", "else", "function",
                "stateful", "payable", "true", "false", "mod", "public", "entrypoint", "private", "indexed", "namespace",
                "interface", "main", "using", "as", "for", "hiding", "band", "bor", "bxor", "bnot"
               ],
    KW = string:join(Keywords, "|"),
    Rules =
          %% Comments and whitespace
        [ CommentStart
        , {"//.*", skip()}
        , {WS,     skip()}
          %% Special characters
        , {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
          %% Literals
        , {CHAR,   token(char,   fun parse_char/1)}
        , {STRING, token(string, fun parse_string/1)}
        , {HEX,    token(hex,    fun parse_hex/1)}
        , {INT,    token(int,    fun parse_int/1)}
        , {BYTES,  token(bytes,  fun parse_bytes/1)}
          %% Identifiers (qualified first!)
        , {QID,   token(qid,  fun(S) -> string:tokens(S, ".") end)}
        , {QCON,  token(qcon, fun(S) -> string:tokens(S, ".") end)}
        , {TVAR,  token(tvar)}
        , override({ID, token(id)}, {KW, symbol()})    %% Keywords override identifiers. Need to
        , {CON, token(con)}                            %% use override to avoid lexing "lettuce"
                                                       %% as ['let', {id, "tuce"}].
          %% Operators
        , {OP, symbol()}
        ],
    [{code, Rules}, {comment, CommentRules}].
 scan(String) ->
    Lexer = so_scan_lib:compile(lexer()),
    so_scan_lib:string(Lexer, code, String).
 ```
 OK. let's look at `so_scan_lib`
 ```erl
 -type regex()     :: iodata() | unicode:charlist().
 -type pos()       :: {integer(), integer()}.
 -type lex_state() :: atom().
 -type token()     :: {atom(), pos(), term()} | {atom(), pos()}.
 -type token_spec()     :: {regex(), token_action()}.
 -opaque token_action() :: fun((string(), pos()) -> {tok_result(), state_change()}).
 -opaque lexer() :: [{lex_state(),
                     fun((string(), pos()) -> {ok, tok_result(), string(), pos()}
                                            | end_of_file | error)}].
 %% -- Internal types --
 -type tok_result()   :: {token, token()} | skip.
 -type state_change() :: none | pop | {push, lex_state()}.
 %% @doc Compile a lexer specification. Takes the regexps for each state and
 %% combines them into a single big regexp that is then compiled with re:compile/1.
 %% Note: contrary to lexer generators like leex, we don't have longest match
 %% semantics (since this isn't supported by re). Use override/2 instead.
 -spec compile([{lex_state(), [token_spec()]}]) -> lexer().
 compile(TokenSpecs) ->
    [{S, compile_spec(Spec)} || {S, Spec} <- TokenSpecs].
 compile_spec(TokenSpecs) ->
    WithIxs     = lists:zip(lists:seq(1, length(TokenSpecs)), TokenSpecs),
    {ok, Regex} = re:compile(["^(", name(0), string:join([ ["(", name(I), R, ")"] || {I, {R, _}} <- WithIxs ], "|"),")"]),
    Actions     = [ Fun || {_, Fun} <- TokenSpecs ],
    fun ("", _Pos) -> end_of_file;
        (S, Pos)  ->
            case re:run(S, Regex, [{capture, all_names}]) of
                {match, [{0, N} | Capture]} ->
                    Index        = 1 + length(lists:takewhile(fun({P, _}) -> P == -1 end, Capture)),
                    Action       = lists:nth(Index, Actions),
                    {TokS, Rest} = lists:split(N, S),
                    Tok          = Action(TokS, Pos),
                    {ok, Tok, Rest, next_pos(TokS, Pos)};
                nomatch ->
                    error
            end
    end.
 ```
 # How does sophia compilation work
@@ -0,0 +1,569 @@
 # Sophia Frequently Questioned Answers
 - Created: 2026-03-30
 - Authors: Peter Harpending `<peterharpending@qpq.swiss>`
 - Last Modified: 2026-06-04
 # References
 - [Sophia Compiler][so]
 - [Sophia docs][so-docs]
 - [Protocol docs](https://git.qpq.swiss/QPQ-AG/protocol)
 - [GSC][gsc]
 - [GSC Token definition](https://git.qpq.swiss/QPQ-AG/gsc/src/commit/ba70aace96ed73138496744f7d90c2666428eafc/include/gsc.hrl#L45-L50)
 [gsc]: https://git.qpq.swiss/QPQ-AG/gsc
 [so]: https://git.qpq.swiss/QPQ-AG/sophia
 [so-docs]: https://git.qpq.swiss/QPQ-AG/sophia/src/branch/master/docs
 # GSC
 [GSC (= "gajumaru sophia compiler")][gsc] is an experimental
 work-in-progress maybe-will-be-finished-maybe-won't Sophia compiler
 that I (PRH) wrote in an effort to experiment with and document the
 Sophia language.
 It is used a lot to illustrate things in this document, so you might
 want to download it and get it to work on your machine. The goal in
 version 0.1 is to match the exact behavior of the [legacy Sophia
 compiler][so].
 I wrote gsc mostly because I got nerdsniped by the problems that gsc
 must solve in order to work. In the interest of retroactively
 justifying being nerdsniped, I will note that the legacy compiler
 1.  is *extremely* central to Gajumaru's trust model
 2.  has many serious-to-semi-serious bugs/warts/edge-cases which
    (prior to this work) were either unknown or
    known-but-not-documented; e.g., unterminated block comments at
    the end of files are admissible provided what precedes is a valid
    Sophia contract:
    ```sophia
    contract Test =
        type state = unit
        entrypoint init() : state =
            ()
    /*
    according to the legacy sophia compiler, this is a totally 100%
    legal sophia contract that ends with an unterminated block
    comment
    ```
 # Architecture of the Sophia Compiler
 First some disclaimers:
 1. **Compilers are _NOT_ magic incomprehensible black boxes** that
   are totally inaccessible to ordinary programmers.  (If you
   encounter one that is, that says more about the compiler and its
   authors than it does about you...). Compilers simply translate a
   well-specified input format into a well-specified output format.
   **Compilers are just ordinary pieces of software that work the
   same way every other piece of software does.**
 2.  Like all other types of software, **compilers have bugs and
    strange unexpected corner cases**. A (the?) purpose of this
    document is to write down all such cases that I have encountered
    thus far in this nerdsnipe adventure.
 Most compilers have some variation of the following architecture:
 1.  **Tokenization** (also called **lexical analysis**); this step
    takes the flat array of input characters found in the source code
    and discovers the "chunk boundaries" in the file:
    ![](./uploads/tokens-c.png)
    Each chunk is called a "token".
 2.  **Parsing** (also called **syntax analysis**); this step takes
    the flat sequence of tokens, and arranges it into a hierarchy
    (usually called an "abstract syntax tree" or "AST").
    The set of rules regarding how the token sequence (labelled
    "signal" in the example below) is transformed into the abstract
    syntax tree is called the **grammar** of the language.
    ```
    source:
        the quick brown fox jumps over the lazy dog
    signal:
        ["the", "quick", "brown", "fox", "jumps",
         "over", "the", "lazy", "dog"]
    ast:
        (Sentence
            (NounPhrase
                (determiner "the")
                (adjective "quick")
                (adjective "brown")
                (noun "fox"))
            (VerbPhrase
                (verb "jumps")
                (PrepositionalPhrase
                    (preposition "over")
                    (NounPhrase
                        (determiner "the")
                        (adjective "lazy")
                        (noun "dog")))))
    ```
    This is the first step in which we think of a language in terms
    of its **structure** rather than simply being a sequence of
    words/tokens.
 3.  **Semantic analysis**: the compiler transforms the abstract
    syntax tree through a sequence of **intermediate
    representations** (**IR**s).
    This is where compiler engineering gets interesting, and factors
    like artistic choice and taste start to dominate. Different
    optimizations occur at different levels of intermediate
    representation. The structure of this meta-step depends heavily on
    the source and target languages, problem domains, goals of the
    specific compiler, etc.
    This is the step in which we think of phrases in the language in
    terms of their **meaning** rather than in some strict notion of
    valid vs. invalid.
 4.  **Code generation**: once the compiler has completed its analysis
    of the input data, and figured out in some precise way what the
    author of the input was attempting to express notionally, it's
    finally time to express said notion in the target language.
 At the time of writing (June 2026), only GSC's tokenizer has been
 fully ironed out and thoroughly tested, the discussion of which will
 constitute the remainder of this document.
 # PITFALL WARNING! TERMINOLOGY COLLISION re "tokens" vs. gsc "signal"
 What most compilers call "tokens", gsc calls "signal".
 GSC classifies tokens into "signal" and "noise"; "noise" means
 comments and whitespace, and "signal" is everything else.
 Most compilers discard "noise" tokens (comments and whitespace). GSC
 retains them for two reasons:
 1. sanity-checking to make sure information isn't lost by accident;
   e.g. one of gsc's tests
 2. future-proofing in case we want to add Python/Lisp-style
 doc comments as a language feature down the line.
 ```python
 def foo():
    "this is a doc comment for foo"
    print("hi from foo")
 ```
 ![](./python-doc1.png)
 ![](./python-doc2.png)
 However for non-bikeshed compiler tasks (figuring out what the code
 is supposed to *do* and then expressing that in the target language),
 noise tokens are entirely irrelevant.
 # What is a token?
 Tokens are the "chunk boundaries" of source files.
 ![](./uploads/tokens-c.png)
 This is roughly analogous to "word boundaries" in natural language;
 we can hack together a string-splitting function in the Erlang shell
 to illustrate the notion:
 ```erlang
 11> Intersperse = fun I([], _Sep) -> []; I([Last], _Sep) -> [Last]; I([One | More], Sep) -> [One, Sep | I(More, Sep)] end.
 #Fun<erl_eval.18.113135111>
 12> Intersperse(["foo", "bar", "baz"], " ").
 ["foo"," ","bar"," ","baz"]
 13> string:tokens("foo bar baz", " ").
 ["foo","bar","baz"]
 14> TokensEn = fun(SrcStr) -> Sep = " ", Signal = string:tokens(SrcStr, Sep), Tokens = Intersperse(Signal, Sep), Tokens end.
 #Fun<erl_eval.42.113135111>
 15> TokensEn("foo bar baz").
 ["foo"," ","bar"," ","baz"]
 16> TokensEn("The quick brown fox jumped over the lazy dog").
 ["The"," ","quick"," ","brown"," ","fox"," ","jumped"," ",
 "over"," ","the"," ","lazy"," ","dog"]
 ```
 You can see the pitfall regarding terminology collision present in
 the behavior of the Erlang standard library `string:tokens/2`
 function, which discards the separator characters:
 ```erlang
 17> string:tokens("foo.bar.baz", ".").
 ["foo","bar","baz"]
 18> string:tokens("foo.bar,baz", ",").
 ["foo.bar","baz"]
 ```
 # Sophia Tokens
 ```erlang
 -type tk_shape()
    :: bcom      % /* ... */
     | lcom      % //
     | ws        % whitespace
     % literals
     | char      % 'a'
     | string    % "foo"
     | int10     % 69_420
     | int16     % 0xDEAD_BEEF
     | bytes     % #DEAD_BEEF
     | ak        % ak_ABC
     | ct        % ct_ABC
     | sg        % sg_ABC
     % kwds/variables/etc
     | id         % foo, foo_bar, foo_bar'baz'  _'foo'
     | con        % Foo, Foo_Bar, FooBar
     | qid        % Foo.Bar.baz
     | qcon       % Foo.Bar.Baz
     | tvar       % 'foo, 'foo_bar, '_'foo'_'bar'''
     % kwds ops and sep are all collapsed by
     % so_scan:scan down to eg {'contract', {420, 69}}
     % where {420, 69} is the source location
     % these are three different parsers
     | kwd        % contract, interface, payable, etc
     | op         % "=!<>+-*/:&|?~@^"
     | sep      % ".." | oneof(",.;()[]{}")
     % kwds and sep are kind of the same thing
     % but i'll keep them separate now for my own sanity. ok
     % i guess op or symbol or whatever is fine.
     %
     % not going to overthink. if having them separate
     % becomes an issue it's easy enough to collapse. harder
     % to separate afterward if collapsing is wrong.
     .
 -type tk_pos() :: {Line :: pos_integer(), Col :: pos_integer()}.
 -record(tk,
        {shape  :: tk_shape(),
         pos    :: tk_pos(),
         str :: string()}).
 -type tk() :: #tk{}.
 ```
 Concretely:
 ```sophia
 // Hello World Contract
 // Copyright (c) 2025 QPQ AG
 contract Hello =
    type state = unit
    entrypoint init(): state =
        ()
    entrypoint hello(): string =
        "hello, world"
 ```
 ![](./uploads/tokens-c.png)
 ```erlang
 [pharpend@desktop ioecs/gsc master] % gsc tokens test/ct/hello.aes
 {tk,lcom,{1,1},"// Hello World Contract"}
 {tk,ws,{1,24},"\n"}
 {tk,lcom,{2,1},"// Copyright (c) 2025 QPQ AG"}
 {tk,ws,{2,29},"\n\n"}
 {tk,kwd,{4,1},"contract"}
 {tk,ws,{4,9}," "}
 {tk,con,{4,10},"Hello"}
 {tk,ws,{4,15}," "}
 {tk,op,{4,16},"="}
 {tk,ws,{4,17},"\n    "}
 {tk,kwd,{5,5},"type"}
 {tk,ws,{5,9}," "}
 {tk,id,{5,10},"state"}
 {tk,ws,{5,15}," "}
 {tk,op,{5,16},"="}
 {tk,ws,{5,17}," "}
 {tk,id,{5,18},"unit"}
 {tk,ws,{5,22},"\n    "}
 {tk,kwd,{6,5},"entrypoint"}
 {tk,ws,{6,15}," "}
 {tk,id,{6,16},"init"}
 {tk,sep,{6,20},"("}
 {tk,sep,{6,21},")"}
 {tk,op,{6,22},":"}
 {tk,ws,{6,23}," "}
 {tk,id,{6,24},"state"}
 {tk,ws,{6,29}," "}
 {tk,op,{6,30},"="}
 {tk,ws,{6,31},"\n        "}
 {tk,sep,{7,9},"("}
 {tk,sep,{7,10},")"}
 {tk,ws,{7,11},"\n\n    "}
 {tk,kwd,{9,5},"entrypoint"}
 {tk,ws,{9,15}," "}
 {tk,id,{9,16},"hello"}
 {tk,sep,{9,21},"("}
 {tk,sep,{9,22},")"}
 {tk,op,{9,23},":"}
 {tk,ws,{9,24}," "}
 {tk,id,{9,25},"string"}
 {tk,ws,{9,31}," "}
 {tk,op,{9,32},"="}
 {tk,ws,{9,33},"\n        "}
 {tk,string,{10,9},"\"hello, world\""}
 {tk,ws,{10,23},"\n"}
 ```
 # Defining Events in interfaces
 apparently this is legal syntax but the point of this is unclear.
 # Can there be the same function name with different arities?
 # What happens if you delete a non-existent key from a map?
 # How does sophia compilation work
 From commit `dbab49936daad7d82bae7cf7336b1ce82e7ab779`
 ```erlang
 % so_compiler.erl:84
 -spec file(string()) -> {ok, map()} | {error, [so_errors:error()]}.
 file(Filename) ->
    file(Filename, []).
 -spec file(string(), options()) -> {ok, map()} | {error, [so_errors:error()]}.
 file(File, Options0) ->
    Options = add_include_path(File, Options0),
    case read_contract(File) of
        {ok, Bin} ->
            SrcDir = so_utils:canonical_dir(filename:dirname(File)),
            from_string(Bin, [{src_file, File}, {src_dir, SrcDir} | Options]);
        {error, Error} ->
            Msg = lists:flatten([File,": ",file:format_error(Error)]),
            {error, [so_errors:new(file_error, Msg)]}
    end.
 -spec from_string(binary() | string(), options()) -> {ok, map()} | {error, [so_errors:error()]}.
 from_string(ContractBin, Options) when is_binary(ContractBin) ->
    from_string(binary_to_list(ContractBin), Options);
 from_string(ContractString, Options) ->
    try
        from_string1(ContractString, Options)
    catch
        throw:{error, Errors} -> {error, Errors}
    end.
 from_string1(ContractString, Options) ->
    #{ fcode := FCode
     , fcode_env := FCodeEnv
     , folded_typed_ast := FoldedTypedAst
     , warnings := Warnings } = string_to_code(ContractString, Options),
    #{ child_con_env := ChildContracts } = FCodeEnv,
    SavedFreshNames = maps:get(saved_fresh_names, FCodeEnv, #{}),
    FateCode = so_fcode_to_fate:compile(ChildContracts, FCode, SavedFreshNames, Options),
    pp_assembler(FateCode, Options),
    ByteCode = gmb_fate_code:serialize(FateCode, []),
    {ok, Version} = version(),
    Res = #{byte_code => ByteCode,
            compiler_version => Version,
            contract_source => ContractString,
            type_info => [],
            fate_code => FateCode,
            abi_version => gmb_fate_abi:abi_version(),
            payable => maps:get(payable, FCode),
            warnings => Warnings
           },
    {ok, maybe_generate_aci(Res, FoldedTypedAst, Options)}.
 ```
 So a lot is going on in `string_to_code/2`
 ```erlang
 -spec string_to_code(string(), options()) -> map().
 string_to_code(ContractString, Options) ->
    Ast = parse(ContractString, Options),
    pp_sophia_code(Ast, Options),
    pp_ast(Ast, Options),
    {TypeEnv, FoldedTypedAst, UnfoldedTypedAst, Warnings} = so_ast_infer_types:infer(Ast, [return_env | Options]),
    pp_typed_ast(UnfoldedTypedAst, Options),
    {Env, Fcode} = so_ast_to_fcode:ast_to_fcode(UnfoldedTypedAst, [{original_src, ContractString}|Options]),
    #{ fcode => Fcode
    ,  fcode_env => Env
    ,  unfolded_typed_ast => UnfoldedTypedAst
    ,  folded_typed_ast => FoldedTypedAst
    ,  type_env  => TypeEnv
    ,  ast => Ast
    ,  warnings => Warnings }.
 -spec parse(string(), so_compiler:options()) -> none() | so_syntax:ast().
 parse(Text, Options) ->
    parse(Text, sets:new(), Options).
 -spec parse(string(), sets:set(), so_compiler:options()) -> none() | so_syntax:ast().
 parse(Text, Included, Options) ->
    so_parser:string(Text, Included, Options).
 ```
 So we get an AST from `so_parser:string/3`
 ```erlang
 %% so_parser.erl
 -spec string(string(), sets:set(include_hash()), so_compiler:options()) -> parse_result().
 string(String, Included, Opts) ->
    AST = run_parser(file(), String, Opts),
    case expand_includes(AST, Included, Opts) of
        {ok, AST1}   -> AST1;
        {error, Err} -> parse_error(Err)
    end.
 run_parser(P, Inp) ->
    escape_errors(parse_and_scan(P, Inp, [])).
 run_parser(P, Inp, Opts) ->
    escape_errors(parse_and_scan(P, Inp, Opts)).
 parse_and_scan(P, S, Opts) ->
    set_current_file(proplists:get_value(src_file, Opts, no_file)),
    set_current_dir(proplists:get_value(src_dir, Opts, no_file)),
    set_current_include_type(proplists:get_value(include_type, Opts, none)),
    case so_scan:scan(S) of
        {ok, Tokens} -> so_parse_lib:parse(P, Tokens);
        {error, {{Input, Pos}, _}} ->
            {error, {Pos, scan_error, Input}}
    end.
 ```
 So there's a lot of metadata being kept, but the key part is the call to
 `so_scan:scan/1`
 ```erlang
 lexer() ->
    Number   = fun(Digit) -> [Digit, "+(_", Digit, "+)*"] end,
    DIGIT    = "[0-9]",
    HEXDIGIT = "[0-9a-fA-F]",
    LOWER    = "[a-z_]",
    UPPER    = "[A-Z]",
    CON      = [UPPER, "[a-zA-Z0-9_]*"],
    INT      = Number(DIGIT),
    HEX      = ["0x", Number(HEXDIGIT)],
    BYTES    = ["#", Number(HEXDIGIT)],
    WS       = "[\\000-\\ ]+",
    ID       = [LOWER, "[a-zA-Z0-9_']*"],
    TVAR     = ["'", ID],
    QID      = ["(", CON, "\\.)+", ID],
    QCON     = ["(", CON, "\\.)+", CON],
    OP       = "[=!<>+\\-*/:&|?~@^]+",
    %% Five cases for a character
    %%  * 1 7-bit ascii, not \ or '
    %%  * 2-4 8-bit values (UTF8)
    %%  * \ followed by a known modifier [aernrtv]
    %%  * \xhh
    %%  * \x{hhh...}
    CHAR     = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
    STRING   = "\"([^\"\\\\]|(\\\\.))*\"",
    CommentStart = {"/\\*", push(comment, skip())},
    CommentRules =
        [ CommentStart
        , {"\\*/",        pop(skip())}
        , {"[^/*]+|[/*]", skip()} ],
    Keywords = ["contract", "include", "let", "switch", "type", "record", "datatype", "if", "elif", "else", "function",
                "stateful", "payable", "true", "false", "mod", "public", "entrypoint", "private", "indexed", "namespace",
                "interface", "main", "using", "as", "for", "hiding", "band", "bor", "bxor", "bnot"
               ],
    KW = string:join(Keywords, "|"),
    Rules =
          %% Comments and whitespace
        [ CommentStart
        , {"//.*", skip()}
        , {WS,     skip()}
          %% Special characters
        , {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
          %% Literals
        , {CHAR,   token(char,   fun parse_char/1)}
        , {STRING, token(string, fun parse_string/1)}
        , {HEX,    token(hex,    fun parse_hex/1)}
        , {INT,    token(int,    fun parse_int/1)}
        , {BYTES,  token(bytes,  fun parse_bytes/1)}
          %% Identifiers (qualified first!)
        , {QID,   token(qid,  fun(S) -> string:tokens(S, ".") end)}
        , {QCON,  token(qcon, fun(S) -> string:tokens(S, ".") end)}
        , {TVAR,  token(tvar)}
        , override({ID, token(id)}, {KW, symbol()})    %% Keywords override identifiers. Need to
        , {CON, token(con)}                            %% use override to avoid lexing "lettuce"
                                                       %% as ['let', {id, "tuce"}].
          %% Operators
        , {OP, symbol()}
        ],
    [{code, Rules}, {comment, CommentRules}].
 scan(String) ->
    Lexer = so_scan_lib:compile(lexer()),
    so_scan_lib:string(Lexer, code, String).
 ```
 OK. let's look at `so_scan_lib`
 ```erlang
 -type regex()     :: iodata() | unicode:charlist().
 -type pos()       :: {integer(), integer()}.
 -type lex_state() :: atom().
 -type token()     :: {atom(), pos(), term()} | {atom(), pos()}.
 -type token_spec()     :: {regex(), token_action()}.
 -opaque token_action() :: fun((string(), pos()) -> {tok_result(), state_change()}).
 -opaque lexer() :: [{lex_state(),
                     fun((string(), pos()) -> {ok, tok_result(), string(), pos()}
                                            | end_of_file | error)}].
 %% -- Internal types --
 -type tok_result()   :: {token, token()} | skip.
 -type state_change() :: none | pop | {push, lex_state()}.
 %% @doc Compile a lexer specification. Takes the regexps for each state and
 %% combines them into a single big regexp that is then compiled with re:compile/1.
 %% Note: contrary to lexer generators like leex, we don't have longest match
 %% semantics (since this isn't supported by re). Use override/2 instead.
 -spec compile([{lex_state(), [token_spec()]}]) -> lexer().
 compile(TokenSpecs) ->
    [{S, compile_spec(Spec)} || {S, Spec} <- TokenSpecs].
 compile_spec(TokenSpecs) ->
    WithIxs     = lists:zip(lists:seq(1, length(TokenSpecs)), TokenSpecs),
    {ok, Regex} = re:compile(["^(", name(0), string:join([ ["(", name(I), R, ")"] || {I, {R, _}} <- WithIxs ], "|"),")"]),
    Actions     = [ Fun || {_, Fun} <- TokenSpecs ],
    fun ("", _Pos) -> end_of_file;
        (S, Pos)  ->
            case re:run(S, Regex, [{capture, all_names}]) of
                {match, [{0, N} | Capture]} ->
                    Index        = 1 + length(lists:takewhile(fun({P, _}) -> P == -1 end, Capture)),
                    Action       = lists:nth(Index, Actions),
                    {TokS, Rest} = lists:split(N, S),
                    Tok          = Action(TokS, Pos),
                    {ok, Tok, Rest, next_pos(TokS, Pos)};
                nomatch ->
                    error
            end
    end.
 ```
 # How does sophia compilation work
@@ -4,88 +4,6 @@ documenting for myself
 ## Sophia syntax highlighting
-todo. it's on github somewhere, not hard to find
+See: <https://github.com/yinkaenoch/sophia-vim-syntax>
-## fuzzy finding plugin
+Read the link there and do the needful
 this is annoying and requires like 10 minutes of setup.
 BUT this is super helpful in huge repositories such as the node codebase
 ```
 sudo apt install bat fd-find fzf ripgrep
 ```
 (devuan excalibur)
 ripgrep is optional, craig, but the vim plugin needs it if you want to search
 for regexes *inside* files
 say you're trying to quickly remember what the fuck `gmser_id:id()` is. Like is
 that the record or is that the 33-byte tagged public key? I can't remember and
 neither can you
 ![](./uploads/ripgrep-vim.png)
 this saves you like 15 seconds and a bunch of context switching. each time
 the plugin is super annoying to install but basically don't follow any of the
 instructions in the repo. just clone the `fzf.vim` repo on github (google) to
 `~/.vim/bundle/fzf.vim`.
 you also need to tell vim to load the `.vim` file that ships with the package
 ```
 [pharpend@picklet ioecs/GajuDesk master] % dpkg -L fzf
 /.
 /usr
 /usr/bin
 /usr/bin/fzf
 /usr/bin/fzf-tmux
 /usr/share
 /usr/share/doc
 /usr/share/doc/fzf
 /usr/share/doc/fzf/README-VIM.md.gz
 /usr/share/doc/fzf/README.Debian
 /usr/share/doc/fzf/README.md.gz
 /usr/share/doc/fzf/changelog.Debian.amd64.gz
 /usr/share/doc/fzf/changelog.Debian.gz
 /usr/share/doc/fzf/changelog.gz
 /usr/share/doc/fzf/copyright
 /usr/share/doc/fzf/examples
 /usr/share/doc/fzf/examples/completion.bash
 /usr/share/doc/fzf/examples/completion.zsh
 /usr/share/doc/fzf/examples/fzf.vim
 /usr/share/doc/fzf/examples/key-bindings.bash
 /usr/share/doc/fzf/examples/key-bindings.fish
 /usr/share/doc/fzf/examples/key-bindings.zsh
 /usr/share/doc/fzf/examples/plugin
 /usr/share/fish
 /usr/share/fish/vendor_functions.d
 /usr/share/fish/vendor_functions.d/fzf_key_bindings.fish
 /usr/share/man
 /usr/share/man/man1
 /usr/share/man/man1/fzf-tmux.1.gz
 /usr/share/man/man1/fzf.1.gz
 /usr/share/doc/fzf/examples/plugin/fzf.vim
 ```
 last file there. put that file at `~/.vim/autoload/fzf.vim`
 should just work.
 - `:Files` opens the fuzzy file finder
 - `:Rg` is the interactive grep thing shown above
 i have this vimrc:
 ```vim
 let $FZF_DEFAULT_COMMAND = 'fdfind --type f'
 noremap <C-e> :Files<CR>
 noremap <C-r> :Rg<CR>
 ```
 the fdfind thing means fuzzy find doesn't surface files in your .gitignore
 (e.g. beam files, `_build` insanity)
 will try and see