stuff

Peter Harpending
2026-06-08 19:02:27 -07:00
parent 70643184c3
commit 40f4ce1e87
9 changed files with 572 additions and 344 deletions
+1 -1
@@ -26,7 +26,7 @@ Title | Brief Description
[[Serializations]] | Conventions for field order in Gajumaru data structures [[Serializations]] | Conventions for field order in Gajumaru data structures
[[Smart Contracts]] | Terminology [[Smart Contracts]] | Terminology
[[Sophia]] | Introduction to Sophia, the Gajumaru smart contract language [[Sophia]] | Introduction to Sophia, the Gajumaru smart contract language
[[Sophia FAQ]] | what it says [[Sophia FQA]] | what it says
[[State Channels]] | Overview and characteristics [[State Channels]] | Overview and characteristics
[[Testnet Node Setup]] | Tech support [[Testnet Node Setup]] | Tech support
[[Transaction]] | Terminology [[Transaction]] | Terminology
-259
@@ -1,259 +0,0 @@
# Sophia FAQ
- Created: 2026-03-30
- Authors: Peter Harpending `<peterharpending@qpq.swiss>`
- Last Modified: 2026-04-07
# References
- [Sophia docs](https://git.qpq.swiss/QPQ-AG/sophia/src/branch/master/docs)
- [Protocol docs](https://git.qpq.swiss/QPQ-AG/protocol)
# Defining Events in interfaces
apparently this is legal syntax but the point of this is unclear.
# Can there be the same function name with different arities?
# What happens if you delete a non-existent key from a map?
# How does sophia compilation work
From commit `dbab49936daad7d82bae7cf7336b1ce82e7ab779`
```erlang
% so_compiler.erl:84
-spec file(string()) -> {ok, map()} | {error, [so_errors:error()]}.
file(Filename) ->
file(Filename, []).
-spec file(string(), options()) -> {ok, map()} | {error, [so_errors:error()]}.
file(File, Options0) ->
Options = add_include_path(File, Options0),
case read_contract(File) of
{ok, Bin} ->
SrcDir = so_utils:canonical_dir(filename:dirname(File)),
from_string(Bin, [{src_file, File}, {src_dir, SrcDir} | Options]);
{error, Error} ->
Msg = lists:flatten([File,": ",file:format_error(Error)]),
{error, [so_errors:new(file_error, Msg)]}
end.
-spec from_string(binary() | string(), options()) -> {ok, map()} | {error, [so_errors:error()]}.
from_string(ContractBin, Options) when is_binary(ContractBin) ->
from_string(binary_to_list(ContractBin), Options);
from_string(ContractString, Options) ->
try
from_string1(ContractString, Options)
catch
throw:{error, Errors} -> {error, Errors}
end.
from_string1(ContractString, Options) ->
#{ fcode := FCode
, fcode_env := FCodeEnv
, folded_typed_ast := FoldedTypedAst
, warnings := Warnings } = string_to_code(ContractString, Options),
#{ child_con_env := ChildContracts } = FCodeEnv,
SavedFreshNames = maps:get(saved_fresh_names, FCodeEnv, #{}),
FateCode = so_fcode_to_fate:compile(ChildContracts, FCode, SavedFreshNames, Options),
pp_assembler(FateCode, Options),
ByteCode = gmb_fate_code:serialize(FateCode, []),
{ok, Version} = version(),
Res = #{byte_code => ByteCode,
compiler_version => Version,
contract_source => ContractString,
type_info => [],
fate_code => FateCode,
abi_version => gmb_fate_abi:abi_version(),
payable => maps:get(payable, FCode),
warnings => Warnings
},
{ok, maybe_generate_aci(Res, FoldedTypedAst, Options)}.
```
So a lot is going on in `string_to_code/2`
```erlang
-spec string_to_code(string(), options()) -> map().
string_to_code(ContractString, Options) ->
Ast = parse(ContractString, Options),
pp_sophia_code(Ast, Options),
pp_ast(Ast, Options),
{TypeEnv, FoldedTypedAst, UnfoldedTypedAst, Warnings} = so_ast_infer_types:infer(Ast, [return_env | Options]),
pp_typed_ast(UnfoldedTypedAst, Options),
{Env, Fcode} = so_ast_to_fcode:ast_to_fcode(UnfoldedTypedAst, [{original_src, ContractString}|Options]),
#{ fcode => Fcode
, fcode_env => Env
, unfolded_typed_ast => UnfoldedTypedAst
, folded_typed_ast => FoldedTypedAst
, type_env => TypeEnv
, ast => Ast
, warnings => Warnings }.
-spec parse(string(), so_compiler:options()) -> none() | so_syntax:ast().
parse(Text, Options) ->
parse(Text, sets:new(), Options).
-spec parse(string(), sets:set(), so_compiler:options()) -> none() | so_syntax:ast().
parse(Text, Included, Options) ->
so_parser:string(Text, Included, Options).
```
So we get an AST from `so_parser:string/3`
```
%% so_parser.erl
-spec string(string(), sets:set(include_hash()), so_compiler:options()) -> parse_result().
string(String, Included, Opts) ->
AST = run_parser(file(), String, Opts),
case expand_includes(AST, Included, Opts) of
{ok, AST1} -> AST1;
{error, Err} -> parse_error(Err)
end.
run_parser(P, Inp) ->
escape_errors(parse_and_scan(P, Inp, [])).
run_parser(P, Inp, Opts) ->
escape_errors(parse_and_scan(P, Inp, Opts)).
parse_and_scan(P, S, Opts) ->
set_current_file(proplists:get_value(src_file, Opts, no_file)),
set_current_dir(proplists:get_value(src_dir, Opts, no_file)),
set_current_include_type(proplists:get_value(include_type, Opts, none)),
case so_scan:scan(S) of
{ok, Tokens} -> so_parse_lib:parse(P, Tokens);
{error, {{Input, Pos}, _}} ->
{error, {Pos, scan_error, Input}}
end.
```
So there's a lot of metadata being kept, but the key part is the call to
`so_scan:scan/1`
```erl
lexer() ->
Number = fun(Digit) -> [Digit, "+(_", Digit, "+)*"] end,
DIGIT = "[0-9]",
HEXDIGIT = "[0-9a-fA-F]",
LOWER = "[a-z_]",
UPPER = "[A-Z]",
CON = [UPPER, "[a-zA-Z0-9_]*"],
INT = Number(DIGIT),
HEX = ["0x", Number(HEXDIGIT)],
BYTES = ["#", Number(HEXDIGIT)],
WS = "[\\000-\\ ]+",
ID = [LOWER, "[a-zA-Z0-9_']*"],
TVAR = ["'", ID],
QID = ["(", CON, "\\.)+", ID],
QCON = ["(", CON, "\\.)+", CON],
OP = "[=!<>+\\-*/:&|?~@^]+",
%% Five cases for a character
%% * 1 7-bit ascii, not \ or '
%% * 2-4 8-bit values (UTF8)
%% * \ followed by a known modifier [aernrtv]
%% * \xhh
%% * \x{hhh...}
CHAR = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
STRING = "\"([^\"\\\\]|(\\\\.))*\"",
CommentStart = {"/\\*", push(comment, skip())},
CommentRules =
[ CommentStart
, {"\\*/", pop(skip())}
, {"[^/*]+|[/*]", skip()} ],
Keywords = ["contract", "include", "let", "switch", "type", "record", "datatype", "if", "elif", "else", "function",
"stateful", "payable", "true", "false", "mod", "public", "entrypoint", "private", "indexed", "namespace",
"interface", "main", "using", "as", "for", "hiding", "band", "bor", "bxor", "bnot"
],
KW = string:join(Keywords, "|"),
Rules =
%% Comments and whitespace
[ CommentStart
, {"//.*", skip()}
, {WS, skip()}
%% Special characters
, {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
%% Literals
, {CHAR, token(char, fun parse_char/1)}
, {STRING, token(string, fun parse_string/1)}
, {HEX, token(hex, fun parse_hex/1)}
, {INT, token(int, fun parse_int/1)}
, {BYTES, token(bytes, fun parse_bytes/1)}
%% Identifiers (qualified first!)
, {QID, token(qid, fun(S) -> string:tokens(S, ".") end)}
, {QCON, token(qcon, fun(S) -> string:tokens(S, ".") end)}
, {TVAR, token(tvar)}
, override({ID, token(id)}, {KW, symbol()}) %% Keywords override identifiers. Need to
, {CON, token(con)} %% use override to avoid lexing "lettuce"
%% as ['let', {id, "tuce"}].
%% Operators
, {OP, symbol()}
],
[{code, Rules}, {comment, CommentRules}].
scan(String) ->
Lexer = so_scan_lib:compile(lexer()),
so_scan_lib:string(Lexer, code, String).
```
OK. let's look at `so_scan_lib`
```erl
-type regex() :: iodata() | unicode:charlist().
-type pos() :: {integer(), integer()}.
-type lex_state() :: atom().
-type token() :: {atom(), pos(), term()} | {atom(), pos()}.
-type token_spec() :: {regex(), token_action()}.
-opaque token_action() :: fun((string(), pos()) -> {tok_result(), state_change()}).
-opaque lexer() :: [{lex_state(),
fun((string(), pos()) -> {ok, tok_result(), string(), pos()}
| end_of_file | error)}].
%% -- Internal types --
-type tok_result() :: {token, token()} | skip.
-type state_change() :: none | pop | {push, lex_state()}.
%% @doc Compile a lexer specification. Takes the regexps for each state and
%% combines them into a single big regexp that is then compiled with re:compile/1.
%% Note: contrary to lexer generators like leex, we don't have longest match
%% semantics (since this isn't supported by re). Use override/2 instead.
-spec compile([{lex_state(), [token_spec()]}]) -> lexer().
compile(TokenSpecs) ->
[{S, compile_spec(Spec)} || {S, Spec} <- TokenSpecs].
compile_spec(TokenSpecs) ->
WithIxs = lists:zip(lists:seq(1, length(TokenSpecs)), TokenSpecs),
{ok, Regex} = re:compile(["^(", name(0), string:join([ ["(", name(I), R, ")"] || {I, {R, _}} <- WithIxs ], "|"),")"]),
Actions = [ Fun || {_, Fun} <- TokenSpecs ],
fun ("", _Pos) -> end_of_file;
(S, Pos) ->
case re:run(S, Regex, [{capture, all_names}]) of
{match, [{0, N} | Capture]} ->
Index = 1 + length(lists:takewhile(fun({P, _}) -> P == -1 end, Capture)),
Action = lists:nth(Index, Actions),
{TokS, Rest} = lists:split(N, S),
Tok = Action(TokS, Pos),
{ok, Tok, Rest, next_pos(TokS, Pos)};
nomatch ->
error
end
end.
```
# How does sophia compilation work
+569
@@ -0,0 +1,569 @@
# Sophia Frequently Questioned Answers
- Created: 2026-03-30
- Authors: Peter Harpending `<peterharpending@qpq.swiss>`
- Last Modified: 2026-06-04
# References
- [Sophia Compiler][so]
- [Sophia docs][so-docs]
- [Protocol docs](https://git.qpq.swiss/QPQ-AG/protocol)
- [GSC][gsc]
- [GSC Token definition](https://git.qpq.swiss/QPQ-AG/gsc/src/commit/ba70aace96ed73138496744f7d90c2666428eafc/include/gsc.hrl#L45-L50)
[gsc]: https://git.qpq.swiss/QPQ-AG/gsc
[so]: https://git.qpq.swiss/QPQ-AG/sophia
[so-docs]: https://git.qpq.swiss/QPQ-AG/sophia/src/branch/master/docs
# GSC
[GSC (= "gajumaru sophia compiler")][gsc] is an experimental
work-in-progress maybe-will-be-finished-maybe-won't Sophia compiler
that I (PRH) wrote in an effort to experiment with and document the
Sophia language.
It is used a lot to illustrate things in this document, so you might
want to download it and get it to work on your machine. The goal in
version 0.1 is to match the exact behavior of the [legacy Sophia
compiler][so].
I wrote gsc mostly because I got nerdsniped by the problems that gsc
must solve in order to work. In the interest of retroactively
justifying being nerdsniped, I will note that the legacy compiler
1. is *extremely* central to Gajumaru's trust model
2. has many serious-to-semi-serious bugs/warts/edge-cases which
(prior to this work) were either unknown or
known-but-not-documented; e.g., unterminated block comments at
the end of files are admissible provided what precedes is a valid
Sophia contract:
```sophia
contract Test =
type state = unit
entrypoint init() : state =
()
/*
according to the legacy sophia compiler, this is a totally 100%
legal sophia contract that ends with an unterminated block
comment
```
# Architecture of the Sophia Compiler
First some disclaimers:
1. **Compilers are _NOT_ magic incomprehensible black boxes** that
are totally inaccessible to ordinary programmers. (If you
encounter one that is, that says more about the compiler and its
authors than it does about you...). Compilers simply translate a
well-specified input format into a well-specified output format.
**Compilers are just ordinary pieces of software that work the
same way every other piece of software does.**
2. Like all other types of software, **compilers have bugs and
strange unexpected corner cases**. A (the?) purpose of this
document is to write down all such cases that I have encountered
thus far in this nerdsnipe adventure.
Most compilers have some variation of the following architecture:
1. **Tokenization** (also called **lexical analysis**); this step
takes the flat array of input characters found in the source code
and discovers the "chunk boundaries" in the file:
![](./uploads/tokens-c.png)
Each chunk is called a "token".
2. **Parsing** (also called **syntax analysis**); this step takes
the flat sequence of tokens, and arranges it into a hierarchy
(usually called an "abstract syntax tree" or "AST").
The set of rules regarding how the token sequence (labelled "signal"
in the example below, following gsc's terminology) is transformed into
the abstract syntax tree is called the **grammar** of the language.
```
source:
the quick brown fox jumps over the lazy dog
signal:
["the", "quick", "brown", "fox", "jumps",
"over", "the", "lazy", "dog"]
ast:
(Sentence
(NounPhrase
(determiner "the")
(adjective "quick")
(adjective "brown")
(noun "fox"))
(VerbPhrase
(verb "jumps")
(PrepositionalPhrase
(preposition "over")
(NounPhrase
(determiner "the")
(adjective "lazy")
(noun "dog")))))
```
This is the first step in which we think of a language in terms
of its **structure** rather than simply being a sequence of
words/tokens.
3. **Semantic analysis**: the compiler transforms the abstract
syntax tree through a sequence of **intermediate
representations** (**IR**s).
This is where compiler engineering gets interesting, and factors
like artistic choice and taste start to dominate. Different
optimizations occur at different levels of intermediate
representation. The structure of this meta-step depends heavily on
the source and target languages, problem domains, goals of the
specific compiler, etc.
This is the step in which we think of phrases in the language in
terms of their **meaning** rather than in some strict notion of
valid vs. invalid.
4. **Code generation**: once the compiler has completed its analysis
of the input data, and figured out in some precise way what the
author of the input was attempting to express notionally, it's
finally time to express said notion in the target language.
At the time of writing (June 2026), only GSC's tokenizer has been
fully ironed out and thoroughly tested, the discussion of which will
constitute the remainder of this document.
# PITFALL WARNING! TERMINOLOGY COLLISION re "tokens" vs. gsc "signal"
What most compilers call "tokens", gsc calls "signal".
GSC classifies tokens into "signal" and "noise"; "noise" means
comments and whitespace, and "signal" is everything else.
Most compilers discard "noise" tokens (comments and whitespace). GSC
retains them for two reasons:
1. sanity-checking to make sure information isn't lost by accident;
e.g. one of gsc's tests
2. future-proofing in case we want to add Python/Lisp
style doc comments as a language feature down the line.
```python
def foo():
"this is a doc comment for foo"
print("hi from foo")
```
![](./python-doc1.png)
![](./python-doc2.png)
However for non-bikeshed compiler tasks (figuring out what the code
is supposed to *do* and then expressing that in the target language),
noise tokens are entirely irrelevant.
# What is a token?
Tokens are the "chunk boundaries" of source files.
![](./uploads/tokens-c.png)
This is roughly analogous to "word boundaries" in natural language;
we can hack together a string-splitting function in the Erlang shell
to illustrate the notion:
```erlang
11> Intersperse = fun I([], _Sep) -> []; I([Last], _Sep) -> [Last]; I([One | More], Sep) -> [One, Sep | I(More, Sep)] end.
#Fun<erl_eval.18.113135111>
12> Intersperse(["foo", "bar", "baz"], " ").
["foo"," ","bar"," ","baz"]
13> string:tokens("foo bar baz", " ").
["foo","bar","baz"]
14> TokensEn = fun(SrcStr) -> Sep = " ", Signal = string:tokens(SrcStr, Sep), Tokens = Intersperse(Signal, Sep), Tokens end.
#Fun<erl_eval.42.113135111>
15> TokensEn("foo bar baz").
["foo"," ","bar"," ","baz"]
16> TokensEn("The quick brown fox jumped over the lazy dog").
["The"," ","quick"," ","brown"," ","fox"," ","jumped"," ",
"over"," ","the"," ","lazy"," ","dog"]
```
You can see the pitfall regarding terminology collision present in
the behavior of the Erlang standard library `string:tokens/2`
function, which discards the separator characters:
```erlang
17> string:tokens("foo.bar.baz", ".").
["foo","bar","baz"]
18> string:tokens("foo.bar,baz", ",").
["foo.bar","baz"]
```
# Sophia Tokens
```erlang
-type tk_shape()
:: bcom % /* ... */
| lcom % //
| ws % whitespace
% literals
| char % 'a'
| string % "foo"
| int10 % 69_420
| int16 % 0xDEAD_BEEF
| bytes % #DEAD_BEEF
| ak % ak_ABC
| ct % ct_ABC
| sg % sg_ABC
% kwds/variables/etc
| id % foo, foo_bar, foo_bar'baz' _'foo'
| con % Foo, Foo_Bar, FooBar
| qid % Foo.Bar.baz
| qcon % Foo.Bar.Baz
| tvar % 'foo, 'foo_bar, '_'foo'_'bar'''
% kwds ops and sep are all collapsed by
% so_scan:scan down to eg {'contract', {420, 69}}
% where {420, 69} is the source location
% these are three different parsers
| kwd % contract, interface, payable, etc
| op % "=!<>+-*/:&|?~@^"
| sep % ".." | oneof(",.;()[]{}")
% kwds and sep are kind of the same thing
% but i'll keep them separate now for my own sanity. ok
% i guess op or symbol or whatever is fine.
%
% not going to overthink. if having them separate
% becomes an issue it's easy enough to collapse. harder
% to separate afterward if collapsing is wrong.
.
-type tk_pos() :: {Line :: pos_integer(), Col :: pos_integer()}.
-record(tk,
{shape :: tk_shape(),
pos :: tk_pos(),
str :: string()}).
-type tk() :: #tk{}.
```
Concretely:
```sophia
// Hello World Contract
// Copyright (c) 2025 QPQ AG
contract Hello =
type state = unit
entrypoint init(): state =
()
entrypoint hello(): string =
"hello, world"
```
![](./uploads/tokens-c.png)
```erlang
[pharpend@desktop ioecs/gsc master] % gsc tokens test/ct/hello.aes
{tk,lcom,{1,1},"// Hello World Contract"}
{tk,ws,{1,24},"\n"}
{tk,lcom,{2,1},"// Copyright (c) 2025 QPQ AG"}
{tk,ws,{2,29},"\n\n"}
{tk,kwd,{4,1},"contract"}
{tk,ws,{4,9}," "}
{tk,con,{4,10},"Hello"}
{tk,ws,{4,15}," "}
{tk,op,{4,16},"="}
{tk,ws,{4,17},"\n "}
{tk,kwd,{5,5},"type"}
{tk,ws,{5,9}," "}
{tk,id,{5,10},"state"}
{tk,ws,{5,15}," "}
{tk,op,{5,16},"="}
{tk,ws,{5,17}," "}
{tk,id,{5,18},"unit"}
{tk,ws,{5,22},"\n "}
{tk,kwd,{6,5},"entrypoint"}
{tk,ws,{6,15}," "}
{tk,id,{6,16},"init"}
{tk,sep,{6,20},"("}
{tk,sep,{6,21},")"}
{tk,op,{6,22},":"}
{tk,ws,{6,23}," "}
{tk,id,{6,24},"state"}
{tk,ws,{6,29}," "}
{tk,op,{6,30},"="}
{tk,ws,{6,31},"\n "}
{tk,sep,{7,9},"("}
{tk,sep,{7,10},")"}
{tk,ws,{7,11},"\n\n "}
{tk,kwd,{9,5},"entrypoint"}
{tk,ws,{9,15}," "}
{tk,id,{9,16},"hello"}
{tk,sep,{9,21},"("}
{tk,sep,{9,22},")"}
{tk,op,{9,23},":"}
{tk,ws,{9,24}," "}
{tk,id,{9,25},"string"}
{tk,ws,{9,31}," "}
{tk,op,{9,32},"="}
{tk,ws,{9,33},"\n "}
{tk,string,{10,9},"\"hello, world\""}
{tk,ws,{10,23},"\n"}
```
# Defining Events in interfaces
apparently this is legal syntax but the point of this is unclear.
# Can there be the same function name with different arities?
# What happens if you delete a non-existent key from a map?
# How does sophia compilation work
From commit `dbab49936daad7d82bae7cf7336b1ce82e7ab779`
```erlang
% so_compiler.erl:84
-spec file(string()) -> {ok, map()} | {error, [so_errors:error()]}.
file(Filename) ->
file(Filename, []).
-spec file(string(), options()) -> {ok, map()} | {error, [so_errors:error()]}.
file(File, Options0) ->
Options = add_include_path(File, Options0),
case read_contract(File) of
{ok, Bin} ->
SrcDir = so_utils:canonical_dir(filename:dirname(File)),
from_string(Bin, [{src_file, File}, {src_dir, SrcDir} | Options]);
{error, Error} ->
Msg = lists:flatten([File,": ",file:format_error(Error)]),
{error, [so_errors:new(file_error, Msg)]}
end.
-spec from_string(binary() | string(), options()) -> {ok, map()} | {error, [so_errors:error()]}.
from_string(ContractBin, Options) when is_binary(ContractBin) ->
from_string(binary_to_list(ContractBin), Options);
from_string(ContractString, Options) ->
try
from_string1(ContractString, Options)
catch
throw:{error, Errors} -> {error, Errors}
end.
from_string1(ContractString, Options) ->
#{ fcode := FCode
, fcode_env := FCodeEnv
, folded_typed_ast := FoldedTypedAst
, warnings := Warnings } = string_to_code(ContractString, Options),
#{ child_con_env := ChildContracts } = FCodeEnv,
SavedFreshNames = maps:get(saved_fresh_names, FCodeEnv, #{}),
FateCode = so_fcode_to_fate:compile(ChildContracts, FCode, SavedFreshNames, Options),
pp_assembler(FateCode, Options),
ByteCode = gmb_fate_code:serialize(FateCode, []),
{ok, Version} = version(),
Res = #{byte_code => ByteCode,
compiler_version => Version,
contract_source => ContractString,
type_info => [],
fate_code => FateCode,
abi_version => gmb_fate_abi:abi_version(),
payable => maps:get(payable, FCode),
warnings => Warnings
},
{ok, maybe_generate_aci(Res, FoldedTypedAst, Options)}.
```
So a lot is going on in `string_to_code/2`
```erlang
-spec string_to_code(string(), options()) -> map().
string_to_code(ContractString, Options) ->
Ast = parse(ContractString, Options),
pp_sophia_code(Ast, Options),
pp_ast(Ast, Options),
{TypeEnv, FoldedTypedAst, UnfoldedTypedAst, Warnings} = so_ast_infer_types:infer(Ast, [return_env | Options]),
pp_typed_ast(UnfoldedTypedAst, Options),
{Env, Fcode} = so_ast_to_fcode:ast_to_fcode(UnfoldedTypedAst, [{original_src, ContractString}|Options]),
#{ fcode => Fcode
, fcode_env => Env
, unfolded_typed_ast => UnfoldedTypedAst
, folded_typed_ast => FoldedTypedAst
, type_env => TypeEnv
, ast => Ast
, warnings => Warnings }.
-spec parse(string(), so_compiler:options()) -> none() | so_syntax:ast().
parse(Text, Options) ->
parse(Text, sets:new(), Options).
-spec parse(string(), sets:set(), so_compiler:options()) -> none() | so_syntax:ast().
parse(Text, Included, Options) ->
so_parser:string(Text, Included, Options).
```
So we get an AST from `so_parser:string/3`
```erlang
%% so_parser.erl
-spec string(string(), sets:set(include_hash()), so_compiler:options()) -> parse_result().
string(String, Included, Opts) ->
AST = run_parser(file(), String, Opts),
case expand_includes(AST, Included, Opts) of
{ok, AST1} -> AST1;
{error, Err} -> parse_error(Err)
end.
run_parser(P, Inp) ->
escape_errors(parse_and_scan(P, Inp, [])).
run_parser(P, Inp, Opts) ->
escape_errors(parse_and_scan(P, Inp, Opts)).
parse_and_scan(P, S, Opts) ->
set_current_file(proplists:get_value(src_file, Opts, no_file)),
set_current_dir(proplists:get_value(src_dir, Opts, no_file)),
set_current_include_type(proplists:get_value(include_type, Opts, none)),
case so_scan:scan(S) of
{ok, Tokens} -> so_parse_lib:parse(P, Tokens);
{error, {{Input, Pos}, _}} ->
{error, {Pos, scan_error, Input}}
end.
```
So there's a lot of metadata being kept, but the key part is the call to
`so_scan:scan/1`
```erlang
lexer() ->
Number = fun(Digit) -> [Digit, "+(_", Digit, "+)*"] end,
DIGIT = "[0-9]",
HEXDIGIT = "[0-9a-fA-F]",
LOWER = "[a-z_]",
UPPER = "[A-Z]",
CON = [UPPER, "[a-zA-Z0-9_]*"],
INT = Number(DIGIT),
HEX = ["0x", Number(HEXDIGIT)],
BYTES = ["#", Number(HEXDIGIT)],
WS = "[\\000-\\ ]+",
ID = [LOWER, "[a-zA-Z0-9_']*"],
TVAR = ["'", ID],
QID = ["(", CON, "\\.)+", ID],
QCON = ["(", CON, "\\.)+", CON],
OP = "[=!<>+\\-*/:&|?~@^]+",
%% Five cases for a character
%% * 1 7-bit ascii, not \ or '
%% * 2-4 8-bit values (UTF8)
%% * \ followed by a known modifier [aernrtv]
%% * \xhh
%% * \x{hhh...}
CHAR = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
STRING = "\"([^\"\\\\]|(\\\\.))*\"",
CommentStart = {"/\\*", push(comment, skip())},
CommentRules =
[ CommentStart
, {"\\*/", pop(skip())}
, {"[^/*]+|[/*]", skip()} ],
Keywords = ["contract", "include", "let", "switch", "type", "record", "datatype", "if", "elif", "else", "function",
"stateful", "payable", "true", "false", "mod", "public", "entrypoint", "private", "indexed", "namespace",
"interface", "main", "using", "as", "for", "hiding", "band", "bor", "bxor", "bnot"
],
KW = string:join(Keywords, "|"),
Rules =
%% Comments and whitespace
[ CommentStart
, {"//.*", skip()}
, {WS, skip()}
%% Special characters
, {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
%% Literals
, {CHAR, token(char, fun parse_char/1)}
, {STRING, token(string, fun parse_string/1)}
, {HEX, token(hex, fun parse_hex/1)}
, {INT, token(int, fun parse_int/1)}
, {BYTES, token(bytes, fun parse_bytes/1)}
%% Identifiers (qualified first!)
, {QID, token(qid, fun(S) -> string:tokens(S, ".") end)}
, {QCON, token(qcon, fun(S) -> string:tokens(S, ".") end)}
, {TVAR, token(tvar)}
, override({ID, token(id)}, {KW, symbol()}) %% Keywords override identifiers. Need to
, {CON, token(con)} %% use override to avoid lexing "lettuce"
%% as ['let', {id, "tuce"}].
%% Operators
, {OP, symbol()}
],
[{code, Rules}, {comment, CommentRules}].
scan(String) ->
Lexer = so_scan_lib:compile(lexer()),
so_scan_lib:string(Lexer, code, String).
```
OK. let's look at `so_scan_lib`
```erlang
-type regex() :: iodata() | unicode:charlist().
-type pos() :: {integer(), integer()}.
-type lex_state() :: atom().
-type token() :: {atom(), pos(), term()} | {atom(), pos()}.
-type token_spec() :: {regex(), token_action()}.
-opaque token_action() :: fun((string(), pos()) -> {tok_result(), state_change()}).
-opaque lexer() :: [{lex_state(),
fun((string(), pos()) -> {ok, tok_result(), string(), pos()}
| end_of_file | error)}].
%% -- Internal types --
-type tok_result() :: {token, token()} | skip.
-type state_change() :: none | pop | {push, lex_state()}.
%% @doc Compile a lexer specification. Takes the regexps for each state and
%% combines them into a single big regexp that is then compiled with re:compile/1.
%% Note: contrary to lexer generators like leex, we don't have longest match
%% semantics (since this isn't supported by re). Use override/2 instead.
-spec compile([{lex_state(), [token_spec()]}]) -> lexer().
compile(TokenSpecs) ->
[{S, compile_spec(Spec)} || {S, Spec} <- TokenSpecs].
compile_spec(TokenSpecs) ->
WithIxs = lists:zip(lists:seq(1, length(TokenSpecs)), TokenSpecs),
{ok, Regex} = re:compile(["^(", name(0), string:join([ ["(", name(I), R, ")"] || {I, {R, _}} <- WithIxs ], "|"),")"]),
Actions = [ Fun || {_, Fun} <- TokenSpecs ],
fun ("", _Pos) -> end_of_file;
(S, Pos) ->
case re:run(S, Regex, [{capture, all_names}]) of
{match, [{0, N} | Capture]} ->
Index = 1 + length(lists:takewhile(fun({P, _}) -> P == -1 end, Capture)),
Action = lists:nth(Index, Actions),
{TokS, Rest} = lists:split(N, S),
Tok = Action(TokS, Pos),
{ok, Tok, Rest, next_pos(TokS, Pos)};
nomatch ->
error
end
end.
```
# How does sophia compilation work
+2 -84
@@ -4,88 +4,6 @@ documenting for myself
## Sophia syntax highlighting ## Sophia syntax highlighting
todo. it's on github somewhere, not hard to find See: <https://github.com/yinkaenoch/sophia-vim-syntax>
## fuzzy finding plugin Read the link there and do the needful
this is annoying and requires like 10 minutes of setup.
BUT this is super helpful in huge repositories such as the node codebase
```
sudo apt install bat fd-find fzf ripgrep
```
(devuan excalibur)
ripgrep is optional, craig, but the vim plugin needs it if you want to search
for regexes *inside* files
say you're trying to quickly remember what the fuck `gmser_id:id()` is. Like is
that the record or is that the 33-byte tagged public key? I can't remember and
neither can you
![](./uploads/ripgrep-vim.png)
this saves you like 15 seconds and a bunch of context switching. each time
the plugin is super annoying to install but basically don't follow any of the
instructions in the repo. just clone the `fzf.vim` repo on github (google) to
`~/.vim/bundle/fzf.vim`.
you also need to tell vim to load the `.vim` file that ships with the package
```
[pharpend@picklet ioecs/GajuDesk master] % dpkg -L fzf
/.
/usr
/usr/bin
/usr/bin/fzf
/usr/bin/fzf-tmux
/usr/share
/usr/share/doc
/usr/share/doc/fzf
/usr/share/doc/fzf/README-VIM.md.gz
/usr/share/doc/fzf/README.Debian
/usr/share/doc/fzf/README.md.gz
/usr/share/doc/fzf/changelog.Debian.amd64.gz
/usr/share/doc/fzf/changelog.Debian.gz
/usr/share/doc/fzf/changelog.gz
/usr/share/doc/fzf/copyright
/usr/share/doc/fzf/examples
/usr/share/doc/fzf/examples/completion.bash
/usr/share/doc/fzf/examples/completion.zsh
/usr/share/doc/fzf/examples/fzf.vim
/usr/share/doc/fzf/examples/key-bindings.bash
/usr/share/doc/fzf/examples/key-bindings.fish
/usr/share/doc/fzf/examples/key-bindings.zsh
/usr/share/doc/fzf/examples/plugin
/usr/share/fish
/usr/share/fish/vendor_functions.d
/usr/share/fish/vendor_functions.d/fzf_key_bindings.fish
/usr/share/man
/usr/share/man/man1
/usr/share/man/man1/fzf-tmux.1.gz
/usr/share/man/man1/fzf.1.gz
/usr/share/doc/fzf/examples/plugin/fzf.vim
```
last file there. put that file at `~/.vim/autoload/fzf.vim`
should just work.
- `:Files` opens the fuzzy file finder
- `:Rg` is the interactive grep thing shown above
i have this vimrc:
```vim
let $FZF_DEFAULT_COMMAND = 'fdfind --type f'
noremap <C-e> :Files<CR>
noremap <C-r> :Rg<CR>
```
the fdfind thing means fuzzy find doesn't surface files in your .gitignore
(e.g. beam files, `_build` insanity)
will try and see
BIN
Binary file not shown.

After

Width:  |  Height:  |  Size: 43 KiB

BIN
Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

BIN
Binary file not shown.

After

Width:  |  Height:  |  Size: 6.0 KiB

BIN
Binary file not shown.

Before

Width:  |  Height:  |  Size: 262 KiB

BIN
Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB