stuff

2026-06-08 19:02:27 -07:00
parent 70643184c3
commit 40f4ce1e87
9 changed files with 572 additions and 344 deletions
@@ -26,7 +26,7 @@ Title                       | Brief Description
 [[Serializations]]          | Conventions for field order in Gajumaru data structures
 [[Smart Contracts]]         | Terminology
 [[Sophia]]                  | Introduction to Sophia, the Gajumaru smart contract language
-[[Sophia FAQ]]              | what it says
+[[Sophia FQA]]              | what it says
 [[State Channels]]          | Overview and characteristics
 [[Testnet Node Setup]]      | Tech support
 [[Transaction]]             | Terminology
@@ -1,259 +0,0 @@
-# Sophia FAQ
-
- Created: 2026-03-30
- Authors: Peter Harpending `<peterharpending@qpq.swiss>`
- Last Modified: 2026-04-07
-
-# References
-
- [Sophia docs](https://git.qpq.swiss/QPQ-AG/sophia/src/branch/master/docs)
- [Protocol docs](https://git.qpq.swiss/QPQ-AG/protocol)
-
-# Defining Events in interfaces
-
-apparently this is legal syntax but the point of this is unclear.
-
-# Can there be the same function name with different arities?
-
-# What happens if you delete a non-existent key from a map?
-
-# How does sophia compilation work
-
-
-From commit `dbab49936daad7d82bae7cf7336b1ce82e7ab779`
-
-```erlang
-% so_compiler.erl:84
-spec file(string()) -> {ok, map()} | {error, [so_errors:error()]}.
-file(Filename) ->
-    file(Filename, []).
-
-spec file(string(), options()) -> {ok, map()} | {error, [so_errors:error()]}.
-file(File, Options0) ->
-    Options = add_include_path(File, Options0),
-    case read_contract(File) of
-        {ok, Bin} ->
-            SrcDir = so_utils:canonical_dir(filename:dirname(File)),
-            from_string(Bin, [{src_file, File}, {src_dir, SrcDir} | Options]);
-        {error, Error} ->
-            Msg = lists:flatten([File,": ",file:format_error(Error)]),
-            {error, [so_errors:new(file_error, Msg)]}
-    end.
-
-spec from_string(binary() | string(), options()) -> {ok, map()} | {error, [so_errors:error()]}.
-from_string(ContractBin, Options) when is_binary(ContractBin) ->
-    from_string(binary_to_list(ContractBin), Options);
-from_string(ContractString, Options) ->
-    try
-        from_string1(ContractString, Options)
-    catch
-        throw:{error, Errors} -> {error, Errors}
-    end.
-
-from_string1(ContractString, Options) ->
-    #{ fcode := FCode
-     , fcode_env := FCodeEnv
-     , folded_typed_ast := FoldedTypedAst
-     , warnings := Warnings } = string_to_code(ContractString, Options),
-    #{ child_con_env := ChildContracts } = FCodeEnv,
-    SavedFreshNames = maps:get(saved_fresh_names, FCodeEnv, #{}),
-    FateCode = so_fcode_to_fate:compile(ChildContracts, FCode, SavedFreshNames, Options),
-    pp_assembler(FateCode, Options),
-    ByteCode = gmb_fate_code:serialize(FateCode, []),
-    {ok, Version} = version(),
-    Res = #{byte_code => ByteCode,
-            compiler_version => Version,
-            contract_source => ContractString,
-            type_info => [],
-            fate_code => FateCode,
-            abi_version => gmb_fate_abi:abi_version(),
-            payable => maps:get(payable, FCode),
-            warnings => Warnings
-           },
-    {ok, maybe_generate_aci(Res, FoldedTypedAst, Options)}.
-
-```
-
-So a lot is going on in `string_to_code/2`
-
-```erlang
-spec string_to_code(string(), options()) -> map().
-string_to_code(ContractString, Options) ->
-    Ast = parse(ContractString, Options),
-    pp_sophia_code(Ast, Options),
-    pp_ast(Ast, Options),
-    {TypeEnv, FoldedTypedAst, UnfoldedTypedAst, Warnings} = so_ast_infer_types:infer(Ast, [return_env | Options]),
-    pp_typed_ast(UnfoldedTypedAst, Options),
-    {Env, Fcode} = so_ast_to_fcode:ast_to_fcode(UnfoldedTypedAst, [{original_src, ContractString}|Options]),
-    #{ fcode => Fcode
-    ,  fcode_env => Env
-    ,  unfolded_typed_ast => UnfoldedTypedAst
-    ,  folded_typed_ast => FoldedTypedAst
-    ,  type_env  => TypeEnv
-    ,  ast => Ast
-    ,  warnings => Warnings }.
-
-
-spec parse(string(), so_compiler:options()) -> none() | so_syntax:ast().
-parse(Text, Options) ->
-    parse(Text, sets:new(), Options).
-
-spec parse(string(), sets:set(), so_compiler:options()) -> none() | so_syntax:ast().
-parse(Text, Included, Options) ->
-    so_parser:string(Text, Included, Options).
-```
-
-So we get an AST from `so_parser:string/3`
-
-```
-%% so_parser.erl
-spec string(string(), sets:set(include_hash()), so_compiler:options()) -> parse_result().
-string(String, Included, Opts) ->
-    AST = run_parser(file(), String, Opts),
-    case expand_includes(AST, Included, Opts) of
-        {ok, AST1}   -> AST1;
-        {error, Err} -> parse_error(Err)
-    end.
-
-
-run_parser(P, Inp) ->
-    escape_errors(parse_and_scan(P, Inp, [])).
-run_parser(P, Inp, Opts) ->
-    escape_errors(parse_and_scan(P, Inp, Opts)).
-
-parse_and_scan(P, S, Opts) ->
-    set_current_file(proplists:get_value(src_file, Opts, no_file)),
-    set_current_dir(proplists:get_value(src_dir, Opts, no_file)),
-    set_current_include_type(proplists:get_value(include_type, Opts, none)),
-    case so_scan:scan(S) of
-        {ok, Tokens} -> so_parse_lib:parse(P, Tokens);
-        {error, {{Input, Pos}, _}} ->
-            {error, {Pos, scan_error, Input}}
-    end.
-
-```
-
-So there's a lot of metadata being kept, but the key part is the call to
-`so_scan:scan/1`
-
-```erl
-lexer() ->
-    Number   = fun(Digit) -> [Digit, "+(_", Digit, "+)*"] end,
-    DIGIT    = "[0-9]",
-    HEXDIGIT = "[0-9a-fA-F]",
-    LOWER    = "[a-z_]",
-    UPPER    = "[A-Z]",
-    CON      = [UPPER, "[a-zA-Z0-9_]*"],
-    INT      = Number(DIGIT),
-    HEX      = ["0x", Number(HEXDIGIT)],
-    BYTES    = ["#", Number(HEXDIGIT)],
-    WS       = "[\\000-\\ ]+",
-    ID       = [LOWER, "[a-zA-Z0-9_']*"],
-    TVAR     = ["'", ID],
-    QID      = ["(", CON, "\\.)+", ID],
-    QCON     = ["(", CON, "\\.)+", CON],
-    OP       = "[=!<>+\\-*/:&|?~@^]+",
-    %% Five cases for a character
-    %%  * 1 7-bit ascii, not \ or '
-    %%  * 2-4 8-bit values (UTF8)
-    %%  * \ followed by a known modifier [aernrtv]
-    %%  * \xhh
-    %%  * \x{hhh...}
-    CHAR     = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
-    STRING   = "\"([^\"\\\\]|(\\\\.))*\"",
-
-    CommentStart = {"/\\*", push(comment, skip())},
-    CommentRules =
-        [ CommentStart
-        , {"\\*/",        pop(skip())}
-        , {"[^/*]+|[/*]", skip()} ],
-
-    Keywords = ["contract", "include", "let", "switch", "type", "record", "datatype", "if", "elif", "else", "function",
-                "stateful", "payable", "true", "false", "mod", "public", "entrypoint", "private", "indexed", "namespace",
-                "interface", "main", "using", "as", "for", "hiding", "band", "bor", "bxor", "bnot"
-               ],
-    KW = string:join(Keywords, "|"),
-
-    Rules =
-          %% Comments and whitespace
-        [ CommentStart
-        , {"//.*", skip()}
-        , {WS,     skip()}
-
-          %% Special characters
-        , {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
-
-          %% Literals
-        , {CHAR,   token(char,   fun parse_char/1)}
-        , {STRING, token(string, fun parse_string/1)}
-        , {HEX,    token(hex,    fun parse_hex/1)}
-        , {INT,    token(int,    fun parse_int/1)}
-        , {BYTES,  token(bytes,  fun parse_bytes/1)}
-
-          %% Identifiers (qualified first!)
-        , {QID,   token(qid,  fun(S) -> string:tokens(S, ".") end)}
-        , {QCON,  token(qcon, fun(S) -> string:tokens(S, ".") end)}
-        , {TVAR,  token(tvar)}
-        , override({ID, token(id)}, {KW, symbol()})    %% Keywords override identifiers. Need to
-        , {CON, token(con)}                            %% use override to avoid lexing "lettuce"
-                                                       %% as ['let', {id, "tuce"}].
-          %% Operators
-        , {OP, symbol()}
-        ],
-
-    [{code, Rules}, {comment, CommentRules}].
-
-scan(String) ->
-    Lexer = so_scan_lib:compile(lexer()),
-    so_scan_lib:string(Lexer, code, String).
-```
-
-OK. let's look at `so_scan_lib`
-
-```erl
-type regex()     :: iodata() | unicode:charlist().
-type pos()       :: {integer(), integer()}.
-type lex_state() :: atom().
-type token()     :: {atom(), pos(), term()} | {atom(), pos()}.
-
-type token_spec()     :: {regex(), token_action()}.
-opaque token_action() :: fun((string(), pos()) -> {tok_result(), state_change()}).
-
-opaque lexer() :: [{lex_state(),
-                     fun((string(), pos()) -> {ok, tok_result(), string(), pos()}
-                                            | end_of_file | error)}].
-
-%% -- Internal types --
-type tok_result()   :: {token, token()} | skip.
-type state_change() :: none | pop | {push, lex_state()}.
-
-%% @doc Compile a lexer specification. Takes the regexps for each state and
-%% combines them into a single big regexp that is then compiled with re:compile/1.
-%% Note: contrary to lexer generators like leex, we don't have longest match
-%% semantics (since this isn't supported by re). Use override/2 instead.
-spec compile([{lex_state(), [token_spec()]}]) -> lexer().
-compile(TokenSpecs) ->
-    [{S, compile_spec(Spec)} || {S, Spec} <- TokenSpecs].
-
-compile_spec(TokenSpecs) ->
-    WithIxs     = lists:zip(lists:seq(1, length(TokenSpecs)), TokenSpecs),
-    {ok, Regex} = re:compile(["^(", name(0), string:join([ ["(", name(I), R, ")"] || {I, {R, _}} <- WithIxs ], "|"),")"]),
-    Actions     = [ Fun || {_, Fun} <- TokenSpecs ],
-    fun ("", _Pos) -> end_of_file;
-        (S, Pos)  ->
-            case re:run(S, Regex, [{capture, all_names}]) of
-                {match, [{0, N} | Capture]} ->
-                    Index        = 1 + length(lists:takewhile(fun({P, _}) -> P == -1 end, Capture)),
-                    Action       = lists:nth(Index, Actions),
-                    {TokS, Rest} = lists:split(N, S),
-                    Tok          = Action(TokS, Pos),
-                    {ok, Tok, Rest, next_pos(TokS, Pos)};
-                nomatch ->
-                    error
-            end
-    end.
-```
-
-
-
-# How does sophia compilation work
@@ -0,0 +1,569 @@
+# Sophia Frequently Questioned Answers
+
+- Created: 2026-03-30
+- Authors: Peter Harpending `<peterharpending@qpq.swiss>`
+- Last Modified: 2026-06-04
+
+# References
+
+- [Sophia Compiler][so]
+- [Sophia docs][so-docs]
+- [Protocol docs](https://git.qpq.swiss/QPQ-AG/protocol)
+- [GSC][gsc]
+- [GSC Token definition](https://git.qpq.swiss/QPQ-AG/gsc/src/commit/ba70aace96ed73138496744f7d90c2666428eafc/include/gsc.hrl#L45-L50)
+
+[gsc]: https://git.qpq.swiss/QPQ-AG/gsc
+[so]: https://git.qpq.swiss/QPQ-AG/sophia
+[so-docs]: https://git.qpq.swiss/QPQ-AG/sophia/src/branch/master/docs
+
+# GSC
+
+[GSC (= "gajumaru sophia compiler")][gsc] is an experimental
+work-in-progress maybe-will-be-finished-maybe-won't Sophia compiler
+that I (PRH) wrote in an effort to experiment with and document the
+Sophia language.
+
+It is used a lot to illustrate things in this document, so you might
+want to download it and get it to work on your machine. The goal in
+version 0.1 is to match the exact behavior of the [legacy Sophia
+compiler][so].
+
+I wrote gsc mostly because I got nerdsniped by the problems that gsc
+must solve in order to work. In the interest of retroactively
+justifying being nerdsniped, I will note that the legacy compiler
+
+1.  is *extremely* central to Gajumaru's trust model
+2.  has many serious-to-semi-serious bugs/warts/edge-cases which
+    (prior to this work) were either unknown or
+    known-but-not-documented; e.g., unterminated block comments at
+    the end of files are admissible provided what precedes is a valid
+    Sophia contract:
+
+    ```sophia
+    contract Test =
+        type state = unit
+        entrypoint init() : state =
+            ()
+    /*
+    according to the legacy sophia compiler, this is a totally 100%
+    legal sophia contract that ends with an unterminated block
+    comment
+    ```
+
+
+# Architecture of the Sophia Compiler
+
+First some disclaimers:
+
+1. **Compilers are _NOT_ magic incomprehensible black boxes** that
+   are totally inaccessible to ordinary programmers.  (If you
+   encounter one that is, that says more about the compiler and its
+   authors than it does about you...). Compilers simply translate a
+   well-specified input format into a well-specified output format.
+
+   **Compilers are just ordinary pieces of software that work the
+   same way every other piece of software does.**
+
+2.  Like all other types of software, **compilers have bugs and
+    strange unexpected corner cases**. A (the?) purpose of this
+    document is to write down all such cases that I have encountered
+    thus far in this nerdsnipe adventure.
+
+Most compilers have some variation of the following architecture:
+
+1.  **Tokenization** (also called **lexical analysis**); this step
+    takes the flat array of input characters found in the source code
+    and discovers the "chunk boundaries" in the file:
+
+    ![](./uploads/tokens-c.png)
+
+    Each chunk is called a "token".
+
+2.  **Parsing** (also called **syntax analysis**); this step takes
+    the flat sequence of tokens, and arranges it into a hierarchy
+    (usually called an "abstract syntax tree" or "AST").
+
+    The set of rules regarding how the signal is transformed into the
+    abstract syntax tree is called the **grammar** of the language.
+
+    ```
+    source:
+        the quick brown fox jumps over the lazy dog
+    signal:
+        ["the", "quick", "brown", "fox", "jumps",
+         "over", "the", "lazy", "dog"]
+    ast:
+        (Sentence
+            (NounPhrase
+                (determiner "the")
+                (adjective "quick")
+                (adjective "brown")
+                (noun "fox"))
+            (VerbPhrase
+                (verb "jumps")
+                (PrepositionalPhrase
+                    (preposition "over")
+                    (NounPhrase
+                        (determiner "the")
+                        (adjective "lazy")
+                        (noun "dog")))))
+    ```
+
+    This is the first step in which we think of a language in terms
+    of its **structure** rather than simply being a sequence of
+    words/tokens.
+
+3.  **Semantic analysis**: the compiler transforms the abstract
+    syntax tree through a sequence of **intermediate
+    representations** (**IR**s).
+
+    This is where compiler engineering gets interesting, and factors
+    like artistic choice and taste start to dominate. Different
+    optimizations occur at different levels of intermediate
+    representation. The structure of this meta-step depends heavily on
+    the source and target languages, problem domains, goals of the
+    specific compiler, etc.
+
+    This is the step in which we think of phrases in the language in
+    terms of their **meaning** rather than in some strict notion of
+    valid vs. invalid.
+
+4.  **Code generation**: once the compiler has completed its analysis
+    of the input data, and figured out in some precise way what the
+    author of the input was attempting to express notionally, it's
+    finally time to express said notion in the target language.
+
+At the time of writing (June 2026), only GSC's tokenizer has been
+fully ironed out and thoroughly tested, the discussion of which will
+constitute the remainder of this document.
+
+# PITFALL WARNING! TERMINOLOGY COLLISION re "tokens" vs. gsc "signal"
+
+What most compilers call "tokens", gsc calls "signal".
+
+GSC classifies tokens into "signal" and "noise"; "noise" means
+comments and whitespace, and "signal" is everything else.
+
+Most compilers discard "noise" tokens (comments and whitespace). GSC
+retains them for two reasons:
+
+1. sanity-checking to make sure information isn't lost by accident
+   (e.g. in one of gsc's tests)
+2. future-proofing in case we want to add Python/Lisp-style
+   doc comments as a language feature down the line.
+
+```python
+def foo():
+    "this is a doc comment for foo"
+    print("hi from foo")
+```
+
+![](./python-doc1.png)
+
+![](./python-doc2.png)
+
+However for non-bikeshed compiler tasks (figuring out what the code
+is supposed to *do* and then expressing that in the target language),
+noise tokens are entirely irrelevant.
+
+# What is a token?
+
+Tokens are the "chunk boundaries" of source files.
+
+![](./uploads/tokens-c.png)
+
+This is roughly analogous to "word boundaries" in natural language;
+we can hack together a string-splitting function in the Erlang shell
+to illustrate the notion:
+
+```erlang
+11> Intersperse = fun I([], _Sep) -> []; I([Last], _Sep) -> [Last]; I([One | More], Sep) -> [One, Sep | I(More, Sep)] end.
+#Fun<erl_eval.18.113135111>
+12> Intersperse(["foo", "bar", "baz"], " ").
+["foo"," ","bar"," ","baz"]
+13> string:tokens("foo bar baz", " ").
+["foo","bar","baz"]
+14> TokensEn = fun(SrcStr) -> Sep = " ", Signal = string:tokens(SrcStr, Sep), Tokens = Intersperse(Signal, Sep), Tokens end.
+#Fun<erl_eval.42.113135111>
+15> TokensEn("foo bar baz").
+["foo"," ","bar"," ","baz"]
+16> TokensEn("The quick brown fox jumped over the lazy dog").
+["The"," ","quick"," ","brown"," ","fox"," ","jumped"," ",
+ "over"," ","the"," ","lazy"," ","dog"]
+```
+
+You can see the pitfall regarding terminology collision present in
+the behavior of the Erlang standard library `string:tokens/2`
+function, which discards the separator characters:
+
+```erlang
+17> string:tokens("foo.bar.baz", ".").
+["foo","bar","baz"]
+18> string:tokens("foo.bar,baz", ",").
+["foo.bar","baz"]
+```
+
+# Sophia Tokens
+
+```erlang
+-type tk_shape()
+    :: bcom      % /* ... */
+     | lcom      % //
+     | ws        % whitespace
+     % literals
+     | char      % 'a'
+     | string    % "foo"
+     | int10     % 69_420
+     | int16     % 0xDEAD_BEEF
+     | bytes     % #DEAD_BEEF
+     | ak        % ak_ABC
+     | ct        % ct_ABC
+     | sg        % sg_ABC
+     % kwds/variables/etc
+     | id         % foo, foo_bar, foo_bar'baz'  _'foo'
+     | con        % Foo, Foo_Bar, FooBar
+     | qid        % Foo.Bar.baz
+     | qcon       % Foo.Bar.Baz
+     | tvar       % 'foo, 'foo_bar, '_'foo'_'bar'''
+     % kwds ops and sep are all collapsed by
+     % so_scan:scan down to eg {'contract', {420, 69}}
+     % where {420, 69} is the source location
+     % these are three different parsers
+     | kwd        % contract, interface, payable, etc
+     | op         % "=!<>+-*/:&|?~@^"
+     | sep      % ".." | oneof(",.;()[]{}")
+     % kwds and sep are kind of the same thing
+     % but i'll keep them separate now for my own sanity. ok
+     % i guess op or symbol or whatever is fine.
+     %
+     % not going to overthink. if having them separate
+     % becomes an issue it's easy enough to collapse. harder
+     % to separate afterward if collapsing is wrong.
+     .
+
+-type tk_pos() :: {Line :: pos_integer(), Col :: pos_integer()}.
+
+-record(tk,
+        {shape  :: tk_shape(),
+         pos    :: tk_pos(),
+         str :: string()}).
+
+-type tk() :: #tk{}.
+```
+
+Concretely:
+
+```sophia
+// Hello World Contract
+// Copyright (c) 2025 QPQ AG
+
+contract Hello =
+    type state = unit
+    entrypoint init(): state =
+        ()
+
+    entrypoint hello(): string =
+        "hello, world"
+```
+
+![](./uploads/tokens-c.png)
+
+```erlang
+[pharpend@desktop ioecs/gsc master] % gsc tokens test/ct/hello.aes
+{tk,lcom,{1,1},"// Hello World Contract"}
+{tk,ws,{1,24},"\n"}
+{tk,lcom,{2,1},"// Copyright (c) 2025 QPQ AG"}
+{tk,ws,{2,29},"\n\n"}
+{tk,kwd,{4,1},"contract"}
+{tk,ws,{4,9}," "}
+{tk,con,{4,10},"Hello"}
+{tk,ws,{4,15}," "}
+{tk,op,{4,16},"="}
+{tk,ws,{4,17},"\n    "}
+{tk,kwd,{5,5},"type"}
+{tk,ws,{5,9}," "}
+{tk,id,{5,10},"state"}
+{tk,ws,{5,15}," "}
+{tk,op,{5,16},"="}
+{tk,ws,{5,17}," "}
+{tk,id,{5,18},"unit"}
+{tk,ws,{5,22},"\n    "}
+{tk,kwd,{6,5},"entrypoint"}
+{tk,ws,{6,15}," "}
+{tk,id,{6,16},"init"}
+{tk,sep,{6,20},"("}
+{tk,sep,{6,21},")"}
+{tk,op,{6,22},":"}
+{tk,ws,{6,23}," "}
+{tk,id,{6,24},"state"}
+{tk,ws,{6,29}," "}
+{tk,op,{6,30},"="}
+{tk,ws,{6,31},"\n        "}
+{tk,sep,{7,9},"("}
+{tk,sep,{7,10},")"}
+{tk,ws,{7,11},"\n\n    "}
+{tk,kwd,{9,5},"entrypoint"}
+{tk,ws,{9,15}," "}
+{tk,id,{9,16},"hello"}
+{tk,sep,{9,21},"("}
+{tk,sep,{9,22},")"}
+{tk,op,{9,23},":"}
+{tk,ws,{9,24}," "}
+{tk,id,{9,25},"string"}
+{tk,ws,{9,31}," "}
+{tk,op,{9,32},"="}
+{tk,ws,{9,33},"\n        "}
+{tk,string,{10,9},"\"hello, world\""}
+{tk,ws,{10,23},"\n"}
+```
+
+
+
+# Defining Events in interfaces
+
+apparently this is legal syntax but the point of this is unclear.
+
+# Can there be the same function name with different arities?
+
+# What happens if you delete a non-existent key from a map?
+
+# How does sophia compilation work
+
+
+From commit `dbab49936daad7d82bae7cf7336b1ce82e7ab779`
+
+```erlang
+% so_compiler.erl:84
+-spec file(string()) -> {ok, map()} | {error, [so_errors:error()]}.
+file(Filename) ->
+    file(Filename, []).
+
+-spec file(string(), options()) -> {ok, map()} | {error, [so_errors:error()]}.
+file(File, Options0) ->
+    Options = add_include_path(File, Options0),
+    case read_contract(File) of
+        {ok, Bin} ->
+            SrcDir = so_utils:canonical_dir(filename:dirname(File)),
+            from_string(Bin, [{src_file, File}, {src_dir, SrcDir} | Options]);
+        {error, Error} ->
+            Msg = lists:flatten([File,": ",file:format_error(Error)]),
+            {error, [so_errors:new(file_error, Msg)]}
+    end.
+
+-spec from_string(binary() | string(), options()) -> {ok, map()} | {error, [so_errors:error()]}.
+from_string(ContractBin, Options) when is_binary(ContractBin) ->
+    from_string(binary_to_list(ContractBin), Options);
+from_string(ContractString, Options) ->
+    try
+        from_string1(ContractString, Options)
+    catch
+        throw:{error, Errors} -> {error, Errors}
+    end.
+
+from_string1(ContractString, Options) ->
+    #{ fcode := FCode
+     , fcode_env := FCodeEnv
+     , folded_typed_ast := FoldedTypedAst
+     , warnings := Warnings } = string_to_code(ContractString, Options),
+    #{ child_con_env := ChildContracts } = FCodeEnv,
+    SavedFreshNames = maps:get(saved_fresh_names, FCodeEnv, #{}),
+    FateCode = so_fcode_to_fate:compile(ChildContracts, FCode, SavedFreshNames, Options),
+    pp_assembler(FateCode, Options),
+    ByteCode = gmb_fate_code:serialize(FateCode, []),
+    {ok, Version} = version(),
+    Res = #{byte_code => ByteCode,
+            compiler_version => Version,
+            contract_source => ContractString,
+            type_info => [],
+            fate_code => FateCode,
+            abi_version => gmb_fate_abi:abi_version(),
+            payable => maps:get(payable, FCode),
+            warnings => Warnings
+           },
+    {ok, maybe_generate_aci(Res, FoldedTypedAst, Options)}.
+
+```
+
+So a lot is going on in `string_to_code/2`
+
+```erlang
+-spec string_to_code(string(), options()) -> map().
+string_to_code(ContractString, Options) ->
+    Ast = parse(ContractString, Options),
+    pp_sophia_code(Ast, Options),
+    pp_ast(Ast, Options),
+    {TypeEnv, FoldedTypedAst, UnfoldedTypedAst, Warnings} = so_ast_infer_types:infer(Ast, [return_env | Options]),
+    pp_typed_ast(UnfoldedTypedAst, Options),
+    {Env, Fcode} = so_ast_to_fcode:ast_to_fcode(UnfoldedTypedAst, [{original_src, ContractString}|Options]),
+    #{ fcode => Fcode
+    ,  fcode_env => Env
+    ,  unfolded_typed_ast => UnfoldedTypedAst
+    ,  folded_typed_ast => FoldedTypedAst
+    ,  type_env  => TypeEnv
+    ,  ast => Ast
+    ,  warnings => Warnings }.
+
+
+-spec parse(string(), so_compiler:options()) -> none() | so_syntax:ast().
+parse(Text, Options) ->
+    parse(Text, sets:new(), Options).
+
+-spec parse(string(), sets:set(), so_compiler:options()) -> none() | so_syntax:ast().
+parse(Text, Included, Options) ->
+    so_parser:string(Text, Included, Options).
+```
+
+So we get an AST from `so_parser:string/3`
+
+```erlang
+%% so_parser.erl
+-spec string(string(), sets:set(include_hash()), so_compiler:options()) -> parse_result().
+string(String, Included, Opts) ->
+    AST = run_parser(file(), String, Opts),
+    case expand_includes(AST, Included, Opts) of
+        {ok, AST1}   -> AST1;
+        {error, Err} -> parse_error(Err)
+    end.
+
+
+run_parser(P, Inp) ->
+    escape_errors(parse_and_scan(P, Inp, [])).
+run_parser(P, Inp, Opts) ->
+    escape_errors(parse_and_scan(P, Inp, Opts)).
+
+parse_and_scan(P, S, Opts) ->
+    set_current_file(proplists:get_value(src_file, Opts, no_file)),
+    set_current_dir(proplists:get_value(src_dir, Opts, no_file)),
+    set_current_include_type(proplists:get_value(include_type, Opts, none)),
+    case so_scan:scan(S) of
+        {ok, Tokens} -> so_parse_lib:parse(P, Tokens);
+        {error, {{Input, Pos}, _}} ->
+            {error, {Pos, scan_error, Input}}
+    end.
+
+```
+
+So there's a lot of metadata being kept, but the key part is the call to
+`so_scan:scan/1`
+
+```erl
+lexer() ->
+    Number   = fun(Digit) -> [Digit, "+(_", Digit, "+)*"] end,
+    DIGIT    = "[0-9]",
+    HEXDIGIT = "[0-9a-fA-F]",
+    LOWER    = "[a-z_]",
+    UPPER    = "[A-Z]",
+    CON      = [UPPER, "[a-zA-Z0-9_]*"],
+    INT      = Number(DIGIT),
+    HEX      = ["0x", Number(HEXDIGIT)],
+    BYTES    = ["#", Number(HEXDIGIT)],
+    WS       = "[\\000-\\ ]+",
+    ID       = [LOWER, "[a-zA-Z0-9_']*"],
+    TVAR     = ["'", ID],
+    QID      = ["(", CON, "\\.)+", ID],
+    QCON     = ["(", CON, "\\.)+", CON],
+    OP       = "[=!<>+\\-*/:&|?~@^]+",
+    %% Five cases for a character
+    %%  * 1 7-bit ascii, not \ or '
+    %%  * 2-4 8-bit values (UTF8)
+    %%  * \ followed by a known modifier [aernrtv]
+    %%  * \xhh
+    %%  * \x{hhh...}
+    CHAR     = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
+    STRING   = "\"([^\"\\\\]|(\\\\.))*\"",
+
+    CommentStart = {"/\\*", push(comment, skip())},
+    CommentRules =
+        [ CommentStart
+        , {"\\*/",        pop(skip())}
+        , {"[^/*]+|[/*]", skip()} ],
+
+    Keywords = ["contract", "include", "let", "switch", "type", "record", "datatype", "if", "elif", "else", "function",
+                "stateful", "payable", "true", "false", "mod", "public", "entrypoint", "private", "indexed", "namespace",
+                "interface", "main", "using", "as", "for", "hiding", "band", "bor", "bxor", "bnot"
+               ],
+    KW = string:join(Keywords, "|"),
+
+    Rules =
+          %% Comments and whitespace
+        [ CommentStart
+        , {"//.*", skip()}
+        , {WS,     skip()}
+
+          %% Special characters
+        , {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
+
+          %% Literals
+        , {CHAR,   token(char,   fun parse_char/1)}
+        , {STRING, token(string, fun parse_string/1)}
+        , {HEX,    token(hex,    fun parse_hex/1)}
+        , {INT,    token(int,    fun parse_int/1)}
+        , {BYTES,  token(bytes,  fun parse_bytes/1)}
+
+          %% Identifiers (qualified first!)
+        , {QID,   token(qid,  fun(S) -> string:tokens(S, ".") end)}
+        , {QCON,  token(qcon, fun(S) -> string:tokens(S, ".") end)}
+        , {TVAR,  token(tvar)}
+        , override({ID, token(id)}, {KW, symbol()})    %% Keywords override identifiers. Need to
+        , {CON, token(con)}                            %% use override to avoid lexing "lettuce"
+                                                       %% as ['let', {id, "tuce"}].
+          %% Operators
+        , {OP, symbol()}
+        ],
+
+    [{code, Rules}, {comment, CommentRules}].
+
+scan(String) ->
+    Lexer = so_scan_lib:compile(lexer()),
+    so_scan_lib:string(Lexer, code, String).
+```
+
+OK. let's look at `so_scan_lib`
+
+```erl
+-type regex()     :: iodata() | unicode:charlist().
+-type pos()       :: {integer(), integer()}.
+-type lex_state() :: atom().
+-type token()     :: {atom(), pos(), term()} | {atom(), pos()}.
+
+-type token_spec()     :: {regex(), token_action()}.
+-opaque token_action() :: fun((string(), pos()) -> {tok_result(), state_change()}).
+
+-opaque lexer() :: [{lex_state(),
+                     fun((string(), pos()) -> {ok, tok_result(), string(), pos()}
+                                            | end_of_file | error)}].
+
+%% -- Internal types --
+-type tok_result()   :: {token, token()} | skip.
+-type state_change() :: none | pop | {push, lex_state()}.
+
+%% @doc Compile a lexer specification. Takes the regexps for each state and
+%% combines them into a single big regexp that is then compiled with re:compile/1.
+%% Note: contrary to lexer generators like leex, we don't have longest match
+%% semantics (since this isn't supported by re). Use override/2 instead.
+-spec compile([{lex_state(), [token_spec()]}]) -> lexer().
+compile(TokenSpecs) ->
+    [{S, compile_spec(Spec)} || {S, Spec} <- TokenSpecs].
+
+compile_spec(TokenSpecs) ->
+    WithIxs     = lists:zip(lists:seq(1, length(TokenSpecs)), TokenSpecs),
+    {ok, Regex} = re:compile(["^(", name(0), string:join([ ["(", name(I), R, ")"] || {I, {R, _}} <- WithIxs ], "|"),")"]),
+    Actions     = [ Fun || {_, Fun} <- TokenSpecs ],
+    fun ("", _Pos) -> end_of_file;
+        (S, Pos)  ->
+            case re:run(S, Regex, [{capture, all_names}]) of
+                {match, [{0, N} | Capture]} ->
+                    Index        = 1 + length(lists:takewhile(fun({P, _}) -> P == -1 end, Capture)),
+                    Action       = lists:nth(Index, Actions),
+                    {TokS, Rest} = lists:split(N, S),
+                    Tok          = Action(TokS, Pos),
+                    {ok, Tok, Rest, next_pos(TokS, Pos)};
+                nomatch ->
+                    error
+            end
+    end.
+```
+
+
+
+# How does sophia compilation work
@@ -4,88 +4,6 @@ documenting for myself

 ## Sophia syntax highlighting

-todo. it's on github somewhere, not hard to find
+See: <https://github.com/yinkaenoch/sophia-vim-syntax>

-## fuzzy finding plugin
-
-this is annoying and requires like 10 minutes of setup.
-
-BUT this is super helpful in huge repositories such as the node codebase
-
-```
-sudo apt install bat fd-find fzf ripgrep
-```
-
-(devuan excalibur)
-
-ripgrep is optional, craig, but the vim plugin needs it if you want to search
-for regexes *inside* files
-
-say you're trying to quickly remember what the fuck `gmser_id:id()` is. Like is
-that the record or is that the 33-byte tagged public key? I can't remember and
-neither can you
-
-![](./uploads/ripgrep-vim.png)
-
-this saves you like 15 seconds and a bunch of context switching. each time
-
-the plugin is super annoying to install but basically don't follow any of the
-instructions in the repo. just clone the `fzf.vim` repo on github (google) to
-`~/.vim/bundle/fzf.vim`.
-
-you also need to tell vim to load the `.vim` file that ships with the package
-
-```
-[pharpend@picklet ioecs/GajuDesk master] % dpkg -L fzf
-/.
-/usr
-/usr/bin
-/usr/bin/fzf
-/usr/bin/fzf-tmux
-/usr/share
-/usr/share/doc
-/usr/share/doc/fzf
-/usr/share/doc/fzf/README-VIM.md.gz
-/usr/share/doc/fzf/README.Debian
-/usr/share/doc/fzf/README.md.gz
-/usr/share/doc/fzf/changelog.Debian.amd64.gz
-/usr/share/doc/fzf/changelog.Debian.gz
-/usr/share/doc/fzf/changelog.gz
-/usr/share/doc/fzf/copyright
-/usr/share/doc/fzf/examples
-/usr/share/doc/fzf/examples/completion.bash
-/usr/share/doc/fzf/examples/completion.zsh
-/usr/share/doc/fzf/examples/fzf.vim
-/usr/share/doc/fzf/examples/key-bindings.bash
-/usr/share/doc/fzf/examples/key-bindings.fish
-/usr/share/doc/fzf/examples/key-bindings.zsh
-/usr/share/doc/fzf/examples/plugin
-/usr/share/fish
-/usr/share/fish/vendor_functions.d
-/usr/share/fish/vendor_functions.d/fzf_key_bindings.fish
-/usr/share/man
-/usr/share/man/man1
-/usr/share/man/man1/fzf-tmux.1.gz
-/usr/share/man/man1/fzf.1.gz
-/usr/share/doc/fzf/examples/plugin/fzf.vim
-```
-
-last file there. put that file at `~/.vim/autoload/fzf.vim`
-
-should just work.
-
- `:Files` opens the fuzzy file finder
- `:Rg` is the interactive grep thing shown above
-
-i have this vimrc:
-
-```vim
-let $FZF_DEFAULT_COMMAND = 'fdfind --type f'
-noremap <C-e> :Files<CR>
-noremap <C-r> :Rg<CR>
-```
-
-the fdfind thing means fuzzy find doesn't surface files in your .gitignore
-(e.g. beam files, `_build` insanity)
-
-will try and see
+Read the link there and do the needful