diff --git a/Home.md b/Home.md index 07ba5e0..7f9f64a 100644 --- a/Home.md +++ b/Home.md @@ -26,7 +26,7 @@ Title | Brief Description [[Serializations]] | Conventions for field order in Gajumaru data structures [[Smart Contracts]] | Terminology [[Sophia]] | Introduction to Sophia, the Gajumaru smart contract language -[[Sophia FAQ]] | what it says +[[Sophia FQA]] | what it says [[State Channels]] | Overview and characteristics [[Testnet Node Setup]] | Tech support [[Transaction]] | Terminology diff --git a/Sophia-FAQ.md b/Sophia-FAQ.md deleted file mode 100644 index 3fc44e0..0000000 --- a/Sophia-FAQ.md +++ /dev/null @@ -1,259 +0,0 @@ -# Sophia FAQ - -- Created: 2026-03-30 -- Authors: Peter Harpending `` -- Last Modified: 2026-04-07 - -# References - -- [Sophia docs](https://git.qpq.swiss/QPQ-AG/sophia/src/branch/master/docs) -- [Protocol docs](https://git.qpq.swiss/QPQ-AG/protocol) - -# Defining Events in interfaces - -apparently this is legal syntax but the point of this is unclear. - -# Can there be the same function name with different arities? - -# What happens if you delete a non-existent key from a map? - -# How does sophia compilation work - - -From commit `dbab49936daad7d82bae7cf7336b1ce82e7ab779` - -```erlang -% so_compiler.erl:84 --spec file(string()) -> {ok, map()} | {error, [so_errors:error()]}. -file(Filename) -> - file(Filename, []). - --spec file(string(), options()) -> {ok, map()} | {error, [so_errors:error()]}. -file(File, Options0) -> - Options = add_include_path(File, Options0), - case read_contract(File) of - {ok, Bin} -> - SrcDir = so_utils:canonical_dir(filename:dirname(File)), - from_string(Bin, [{src_file, File}, {src_dir, SrcDir} | Options]); - {error, Error} -> - Msg = lists:flatten([File,": ",file:format_error(Error)]), - {error, [so_errors:new(file_error, Msg)]} - end. - --spec from_string(binary() | string(), options()) -> {ok, map()} | {error, [so_errors:error()]}. 
-from_string(ContractBin, Options) when is_binary(ContractBin) -> - from_string(binary_to_list(ContractBin), Options); -from_string(ContractString, Options) -> - try - from_string1(ContractString, Options) - catch - throw:{error, Errors} -> {error, Errors} - end. - -from_string1(ContractString, Options) -> - #{ fcode := FCode - , fcode_env := FCodeEnv - , folded_typed_ast := FoldedTypedAst - , warnings := Warnings } = string_to_code(ContractString, Options), - #{ child_con_env := ChildContracts } = FCodeEnv, - SavedFreshNames = maps:get(saved_fresh_names, FCodeEnv, #{}), - FateCode = so_fcode_to_fate:compile(ChildContracts, FCode, SavedFreshNames, Options), - pp_assembler(FateCode, Options), - ByteCode = gmb_fate_code:serialize(FateCode, []), - {ok, Version} = version(), - Res = #{byte_code => ByteCode, - compiler_version => Version, - contract_source => ContractString, - type_info => [], - fate_code => FateCode, - abi_version => gmb_fate_abi:abi_version(), - payable => maps:get(payable, FCode), - warnings => Warnings - }, - {ok, maybe_generate_aci(Res, FoldedTypedAst, Options)}. - -``` - -So a lot is going on in `string_to_code/2` - -```erlang --spec string_to_code(string(), options()) -> map(). -string_to_code(ContractString, Options) -> - Ast = parse(ContractString, Options), - pp_sophia_code(Ast, Options), - pp_ast(Ast, Options), - {TypeEnv, FoldedTypedAst, UnfoldedTypedAst, Warnings} = so_ast_infer_types:infer(Ast, [return_env | Options]), - pp_typed_ast(UnfoldedTypedAst, Options), - {Env, Fcode} = so_ast_to_fcode:ast_to_fcode(UnfoldedTypedAst, [{original_src, ContractString}|Options]), - #{ fcode => Fcode - , fcode_env => Env - , unfolded_typed_ast => UnfoldedTypedAst - , folded_typed_ast => FoldedTypedAst - , type_env => TypeEnv - , ast => Ast - , warnings => Warnings }. - - --spec parse(string(), so_compiler:options()) -> none() | so_syntax:ast(). -parse(Text, Options) -> - parse(Text, sets:new(), Options). 
- --spec parse(string(), sets:set(), so_compiler:options()) -> none() | so_syntax:ast(). -parse(Text, Included, Options) -> - so_parser:string(Text, Included, Options). -``` - -So we get an AST from `so_parser:string/3` - -``` -%% so_parser.erl --spec string(string(), sets:set(include_hash()), so_compiler:options()) -> parse_result(). -string(String, Included, Opts) -> - AST = run_parser(file(), String, Opts), - case expand_includes(AST, Included, Opts) of - {ok, AST1} -> AST1; - {error, Err} -> parse_error(Err) - end. - - -run_parser(P, Inp) -> - escape_errors(parse_and_scan(P, Inp, [])). -run_parser(P, Inp, Opts) -> - escape_errors(parse_and_scan(P, Inp, Opts)). - -parse_and_scan(P, S, Opts) -> - set_current_file(proplists:get_value(src_file, Opts, no_file)), - set_current_dir(proplists:get_value(src_dir, Opts, no_file)), - set_current_include_type(proplists:get_value(include_type, Opts, none)), - case so_scan:scan(S) of - {ok, Tokens} -> so_parse_lib:parse(P, Tokens); - {error, {{Input, Pos}, _}} -> - {error, {Pos, scan_error, Input}} - end. 
- -``` - -So there's a lot of metadata being kept, but the key part is the call to -`so_scan:scan/1` - -```erl -lexer() -> - Number = fun(Digit) -> [Digit, "+(_", Digit, "+)*"] end, - DIGIT = "[0-9]", - HEXDIGIT = "[0-9a-fA-F]", - LOWER = "[a-z_]", - UPPER = "[A-Z]", - CON = [UPPER, "[a-zA-Z0-9_]*"], - INT = Number(DIGIT), - HEX = ["0x", Number(HEXDIGIT)], - BYTES = ["#", Number(HEXDIGIT)], - WS = "[\\000-\\ ]+", - ID = [LOWER, "[a-zA-Z0-9_']*"], - TVAR = ["'", ID], - QID = ["(", CON, "\\.)+", ID], - QCON = ["(", CON, "\\.)+", CON], - OP = "[=!<>+\\-*/:&|?~@^]+", - %% Five cases for a character - %% * 1 7-bit ascii, not \ or ' - %% * 2-4 8-bit values (UTF8) - %% * \ followed by a known modifier [aernrtv] - %% * \xhh - %% * \x{hhh...} - CHAR = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'", - STRING = "\"([^\"\\\\]|(\\\\.))*\"", - - CommentStart = {"/\\*", push(comment, skip())}, - CommentRules = - [ CommentStart - , {"\\*/", pop(skip())} - , {"[^/*]+|[/*]", skip()} ], - - Keywords = ["contract", "include", "let", "switch", "type", "record", "datatype", "if", "elif", "else", "function", - "stateful", "payable", "true", "false", "mod", "public", "entrypoint", "private", "indexed", "namespace", - "interface", "main", "using", "as", "for", "hiding", "band", "bor", "bxor", "bnot" - ], - KW = string:join(Keywords, "|"), - - Rules = - %% Comments and whitespace - [ CommentStart - , {"//.*", skip()} - , {WS, skip()} - - %% Special characters - , {"\\.\\.|[,.;()\\[\\]{}]", symbol()} - - %% Literals - , {CHAR, token(char, fun parse_char/1)} - , {STRING, token(string, fun parse_string/1)} - , {HEX, token(hex, fun parse_hex/1)} - , {INT, token(int, fun parse_int/1)} - , {BYTES, token(bytes, fun parse_bytes/1)} - - %% Identifiers (qualified first!) 
- , {QID, token(qid, fun(S) -> string:tokens(S, ".") end)} - , {QCON, token(qcon, fun(S) -> string:tokens(S, ".") end)} - , {TVAR, token(tvar)} - , override({ID, token(id)}, {KW, symbol()}) %% Keywords override identifiers. Need to - , {CON, token(con)} %% use override to avoid lexing "lettuce" - %% as ['let', {id, "tuce"}]. - %% Operators - , {OP, symbol()} - ], - - [{code, Rules}, {comment, CommentRules}]. - -scan(String) -> - Lexer = so_scan_lib:compile(lexer()), - so_scan_lib:string(Lexer, code, String). -``` - -OK. let's look at `so_scan_lib` - -```erl --type regex() :: iodata() | unicode:charlist(). --type pos() :: {integer(), integer()}. --type lex_state() :: atom(). --type token() :: {atom(), pos(), term()} | {atom(), pos()}. - --type token_spec() :: {regex(), token_action()}. --opaque token_action() :: fun((string(), pos()) -> {tok_result(), state_change()}). - --opaque lexer() :: [{lex_state(), - fun((string(), pos()) -> {ok, tok_result(), string(), pos()} - | end_of_file | error)}]. - -%% -- Internal types -- --type tok_result() :: {token, token()} | skip. --type state_change() :: none | pop | {push, lex_state()}. - -%% @doc Compile a lexer specification. Takes the regexps for each state and -%% combines them into a single big regexp that is then compiled with re:compile/1. -%% Note: contrary to lexer generators like leex, we don't have longest match -%% semantics (since this isn't supported by re). Use override/2 instead. --spec compile([{lex_state(), [token_spec()]}]) -> lexer(). -compile(TokenSpecs) -> - [{S, compile_spec(Spec)} || {S, Spec} <- TokenSpecs]. 
- -compile_spec(TokenSpecs) -> - WithIxs = lists:zip(lists:seq(1, length(TokenSpecs)), TokenSpecs), - {ok, Regex} = re:compile(["^(", name(0), string:join([ ["(", name(I), R, ")"] || {I, {R, _}} <- WithIxs ], "|"),")"]), - Actions = [ Fun || {_, Fun} <- TokenSpecs ], - fun ("", _Pos) -> end_of_file; - (S, Pos) -> - case re:run(S, Regex, [{capture, all_names}]) of - {match, [{0, N} | Capture]} -> - Index = 1 + length(lists:takewhile(fun({P, _}) -> P == -1 end, Capture)), - Action = lists:nth(Index, Actions), - {TokS, Rest} = lists:split(N, S), - Tok = Action(TokS, Pos), - {ok, Tok, Rest, next_pos(TokS, Pos)}; - nomatch -> - error - end - end. -``` - - - -# How does sophia compilation work diff --git a/Sophia-FQA.md b/Sophia-FQA.md new file mode 100644 index 0000000..9e262d9 --- /dev/null +++ b/Sophia-FQA.md @@ -0,0 +1,569 @@ +# Sophia Frequently Questioned Answers + +- Created: 2026-03-30 +- Authors: Peter Harpending `` +- Last Modified: 2026-06-04 + +# References + +- [Sophia Compiler][so] +- [Sophia docs][so-docs] +- [Protocol docs](https://git.qpq.swiss/QPQ-AG/protocol) +- [GSC][gsc] +- [GSC Token definition](https://git.qpq.swiss/QPQ-AG/gsc/src/commit/ba70aace96ed73138496744f7d90c2666428eafc/include/gsc.hrl#L45-L50) + +[gsc]: https://git.qpq.swiss/QPQ-AG/gsc +[so]: https://git.qpq.swiss/QPQ-AG/sophia +[so-docs]: https://git.qpq.swiss/QPQ-AG/sophia/src/branch/master/docs + +# GSC + +[GSC (= "gajumaru sophia compiler")][gsc] is an experimental +work-in-progress maybe-will-be-finished-maybe-won't Sophia compiler +that I (PRH) wrote in an effort to experiment with and document the +Sophia language. + +It is used a lot to illustrate things in this document, so you might +want to download it and get it to work on your machine. The goal in +version 0.1 is to match the exact behavior of the [legacy Sophia +compiler][so]. + +I wrote gsc mostly because I got nerdsniped by the problems that gsc +must solve in order to work. 
In the interest of retroactively +justifying being nerdsniped, I will note that the legacy compiler + +1. is *extremely* central to Gajumaru's trust model +2. has many serious-to-semi-serious bugs/warts/edge-cases which + (prior to this work) were either unknown or + known-but-not-documented; e.g., unterminated block comments at + the end of files are admissible provided what precedes is a valid + Sophia contract: + + ```sophia + contract Test = + type state = unit + entrypoint init() : state = + () + /* + according to the legacy sophia compiler, this is a totally 100% + legal sophia contract that ends with an unterminated block + comment + ``` + + +# Architecture of the Sophia Compiler + +First some disclaimers: + +1. **Compilers are _NOT_ magic incomprehensible black boxes** that + are totally inaccessible to ordinary programmers. (If you + encounter one that is, that says more about the compiler and its + authors than it does about you...). Compilers simply translate a + well-specified input format into a well-specified output format. + + **Compilers are just ordinary pieces of software that work the + same way every other piece of software does.** + +2. Like all other types of software, **compilers have bugs and + strange unexpected corner cases**. A (the?) purpose of this + document is to write down all such cases that I have encountered + thus far in this nerdsnipe adventure. + +Most compilers have some variation of the following architecture: + +1. **Tokenization** (also called **lexical analysis**); this step + takes the flat array of input characters found in the source code + and discovers the "chunk boundaries" in the file: + + ![](./uploads/tokens-c.png) + + Each chunk is called a "token". + +2. **Parsing** (also called **syntax analysis**); this step takes + the flat sequence of tokens, and arranges it into a hierarchy + (usually called an "abstract syntax tree" or "AST"). 
+ + The set of rules regarding how the signal is transformed into the + abstract syntax tree is called the **grammar** of the language. + + ``` + source: + the quick brown fox jumps over the lazy dog + signal: + ["the", "quick", "brown", "fox", "jumps", + "over", "the", "lazy", "dog"] + ast: + (Sentence + (NounPhrase + (determiner "the") + (adjective "quick") + (adjective "brown") + (noun "fox")) + (VerbPhrase + (verb "jumps") + (PrepositionalPhrase + (preposition "over") + (NounPhrase + (determiner "the") + (adjective "lazy") + (noun "dog"))))) + ``` + + This is the first step in which we think of a language in terms + of its **structure** rather than simply being a sequence of + words/tokens. + +3. **Semantic analysis**: the compiler transforms the abstract + syntax tree through a sequence of **intermediate + representations** (**IR**s). + + This is where compiler engineering gets interesting, and factors + like artistic choice and taste start to dominate. Different + optimizations occur at different levels of intermediate + representation. The structure of this meta-step depends heavily on + the source and target languages, problem domains, goals of the + specific compiler, etc. + + This is the step in which we think of phrases in the language in + terms of their **meaning** rather than in some strict notion of + valid vs. invalid. + +4. **Code generation**: once the compiler has completed its analysis + of the input data, and figured out in some precise way what the + author of the input was attempting to express notionally, it's + finally time to express said notion in the target language. + +At the time of writing (June 2026), only GSC's tokenizer has been +fully ironed out and thoroughly tested, the discussion of which will +constitute the remainder of this document. + +# PITFALL WARNING! TERMINOLOGY COLLISION re "tokens" vs. gsc "signal" + +What most compilers call "tokens", gsc calls "signal". 
+ +GSC classifies tokens into "signal" and "noise"; "noise" means +comments and whitespace, and "signal" is everything else. + +Most compilers discard "noise" tokens (comments and whitespace). GSC +retains them for two reasons: + +1. sanity-checking to make sure information isn't lost by accident; + e.g. one of gsc's tests +2. future-proofing in case we want to add Python/Lisp +style doc comments as a language feature down the line. + +```python +def foo(): + "this is a doc comment for foo" + print("hi from foo") +``` + +![](./uploads/python-doc1.png) + +![](./uploads/python-doc2.png) + +However for non-bikeshed compiler tasks (figuring out what the code +is supposed to *do* and then expressing that in the target language), +noise tokens are entirely irrelevant. + +# What is a token? + +Tokens are the "chunk boundaries" of source files. + +![](./uploads/tokens-c.png) + +This is roughly analogous to "word boundaries" in natural language; +we can hack together a string-splitting function in the Erlang shell +to illustrate the notion: + +```erlang +11> Intersperse = fun I([], _Sep) -> []; I([Last], _Sep) -> [Last]; I([One | More], Sep) -> [One, Sep | I(More, Sep)] end. +#Fun +12> Intersperse(["foo", "bar", "baz"], " "). +["foo"," ","bar"," ","baz"] +13> string:tokens("foo bar baz", " "). +["foo","bar","baz"] +14> TokensEn = fun(SrcStr) -> Sep = " ", Signal = string:tokens(SrcStr, Sep), Tokens = Intersperse(Signal, Sep), Tokens end. +#Fun +15> TokensEn("foo bar baz"). +["foo"," ","bar"," ","baz"] +16> TokensEn("The quick brown fox jumped over the lazy dog"). +["The"," ","quick"," ","brown"," ","fox"," ","jumped"," ", + "over"," ","the"," ","lazy"," ","dog"] +``` + +You can see the pitfall regarding terminology collision present in +the behavior of the Erlang standard library `string:tokens/2` +function, which discards the separator characters: + +```erlang +17> string:tokens("foo.bar.baz", "."). +["foo","bar","baz"] +18> string:tokens("foo.bar,baz", ","). 
+["foo.bar","baz"] +``` + +# Sophia Tokens + +```erlang +-type tk_shape() + :: bcom % /* ... */ + | lcom % // + | ws % whitespace + % literals + | char % 'a' + | string % "foo" + | int10 % 69_420 + | int16 % 0xDEAD_BEEF + | bytes % #DEAD_BEEF + | ak % ak_ABC + | ct % ct_ABC + | sg % sg_ABC + % kwds/variables/etc + | id % foo, foo_bar, foo_bar'baz' _'foo' + | con % Foo, Foo_Bar, FooBar + | qid % Foo.Bar.baz + | qcon % Foo.Bar.Baz + | tvar % 'foo, 'foo_bar, '_'foo'_'bar''' + % kwds ops and sep are all collapsed by + % so_scan:scan down to eg {'contract', {420, 69}} + % where {420, 69} is the source location + % these are three different parsers + | kwd % contract, interface, payable, etc + | op % "=!<>+-*/:&|?~@^" + | sep % ".." | oneof(",.;()[]{}") + % kwds and sep are kind of the same thing + % but i'll keep them separate now for my own sanity. ok + % i guess op or symbol or whatever is fine. + % + % not going to overthink. if having them separate + % becomes an issue it's easy enough to collapse. harder + % to separate afterward if collapsing is wrong. + . + +-type tk_pos() :: {Line :: pos_integer(), Col :: pos_integer()}. + +-record(tk, + {shape :: tk_shape(), + pos :: tk_pos(), + str :: string()}). + +-type tk() :: #tk{}. 
+``` + +Concretely: + +```sophia +// Hello World Contract +// Copyright (c) 2025 QPQ AG + +contract Hello = + type state = unit + entrypoint init(): state = + () + + entrypoint hello(): string = + "hello, world" +``` + +![](./uploads/tokens-c.png) + +```erlang +[pharpend@desktop ioecs/gsc master] % gsc tokens test/ct/hello.aes +{tk,lcom,{1,1},"// Hello World Contract"} +{tk,ws,{1,24},"\n"} +{tk,lcom,{2,1},"// Copyright (c) 2025 QPQ AG"} +{tk,ws,{2,29},"\n\n"} +{tk,kwd,{4,1},"contract"} +{tk,ws,{4,9}," "} +{tk,con,{4,10},"Hello"} +{tk,ws,{4,15}," "} +{tk,op,{4,16},"="} +{tk,ws,{4,17},"\n "} +{tk,kwd,{5,5},"type"} +{tk,ws,{5,9}," "} +{tk,id,{5,10},"state"} +{tk,ws,{5,15}," "} +{tk,op,{5,16},"="} +{tk,ws,{5,17}," "} +{tk,id,{5,18},"unit"} +{tk,ws,{5,22},"\n "} +{tk,kwd,{6,5},"entrypoint"} +{tk,ws,{6,15}," "} +{tk,id,{6,16},"init"} +{tk,sep,{6,20},"("} +{tk,sep,{6,21},")"} +{tk,op,{6,22},":"} +{tk,ws,{6,23}," "} +{tk,id,{6,24},"state"} +{tk,ws,{6,29}," "} +{tk,op,{6,30},"="} +{tk,ws,{6,31},"\n "} +{tk,sep,{7,9},"("} +{tk,sep,{7,10},")"} +{tk,ws,{7,11},"\n\n "} +{tk,kwd,{9,5},"entrypoint"} +{tk,ws,{9,15}," "} +{tk,id,{9,16},"hello"} +{tk,sep,{9,21},"("} +{tk,sep,{9,22},")"} +{tk,op,{9,23},":"} +{tk,ws,{9,24}," "} +{tk,id,{9,25},"string"} +{tk,ws,{9,31}," "} +{tk,op,{9,32},"="} +{tk,ws,{9,33},"\n "} +{tk,string,{10,9},"\"hello, world\""} +{tk,ws,{10,23},"\n"} +``` + + + +# Defining Events in interfaces + +apparently this is legal syntax but the point of this is unclear. + +# Can there be the same function name with different arities? + +# What happens if you delete a non-existent key from a map? + +# How does sophia compilation work + + +From commit `dbab49936daad7d82bae7cf7336b1ce82e7ab779` + +```erlang +% so_compiler.erl:84 +-spec file(string()) -> {ok, map()} | {error, [so_errors:error()]}. +file(Filename) -> + file(Filename, []). + +-spec file(string(), options()) -> {ok, map()} | {error, [so_errors:error()]}. 
+file(File, Options0) -> + Options = add_include_path(File, Options0), + case read_contract(File) of + {ok, Bin} -> + SrcDir = so_utils:canonical_dir(filename:dirname(File)), + from_string(Bin, [{src_file, File}, {src_dir, SrcDir} | Options]); + {error, Error} -> + Msg = lists:flatten([File,": ",file:format_error(Error)]), + {error, [so_errors:new(file_error, Msg)]} + end. + +-spec from_string(binary() | string(), options()) -> {ok, map()} | {error, [so_errors:error()]}. +from_string(ContractBin, Options) when is_binary(ContractBin) -> + from_string(binary_to_list(ContractBin), Options); +from_string(ContractString, Options) -> + try + from_string1(ContractString, Options) + catch + throw:{error, Errors} -> {error, Errors} + end. + +from_string1(ContractString, Options) -> + #{ fcode := FCode + , fcode_env := FCodeEnv + , folded_typed_ast := FoldedTypedAst + , warnings := Warnings } = string_to_code(ContractString, Options), + #{ child_con_env := ChildContracts } = FCodeEnv, + SavedFreshNames = maps:get(saved_fresh_names, FCodeEnv, #{}), + FateCode = so_fcode_to_fate:compile(ChildContracts, FCode, SavedFreshNames, Options), + pp_assembler(FateCode, Options), + ByteCode = gmb_fate_code:serialize(FateCode, []), + {ok, Version} = version(), + Res = #{byte_code => ByteCode, + compiler_version => Version, + contract_source => ContractString, + type_info => [], + fate_code => FateCode, + abi_version => gmb_fate_abi:abi_version(), + payable => maps:get(payable, FCode), + warnings => Warnings + }, + {ok, maybe_generate_aci(Res, FoldedTypedAst, Options)}. + +``` + +So a lot is going on in `string_to_code/2` + +```erlang +-spec string_to_code(string(), options()) -> map(). 
+string_to_code(ContractString, Options) -> + Ast = parse(ContractString, Options), + pp_sophia_code(Ast, Options), + pp_ast(Ast, Options), + {TypeEnv, FoldedTypedAst, UnfoldedTypedAst, Warnings} = so_ast_infer_types:infer(Ast, [return_env | Options]), + pp_typed_ast(UnfoldedTypedAst, Options), + {Env, Fcode} = so_ast_to_fcode:ast_to_fcode(UnfoldedTypedAst, [{original_src, ContractString}|Options]), + #{ fcode => Fcode + , fcode_env => Env + , unfolded_typed_ast => UnfoldedTypedAst + , folded_typed_ast => FoldedTypedAst + , type_env => TypeEnv + , ast => Ast + , warnings => Warnings }. + + +-spec parse(string(), so_compiler:options()) -> none() | so_syntax:ast(). +parse(Text, Options) -> + parse(Text, sets:new(), Options). + +-spec parse(string(), sets:set(), so_compiler:options()) -> none() | so_syntax:ast(). +parse(Text, Included, Options) -> + so_parser:string(Text, Included, Options). +``` + +So we get an AST from `so_parser:string/3` + +``` +%% so_parser.erl +-spec string(string(), sets:set(include_hash()), so_compiler:options()) -> parse_result(). +string(String, Included, Opts) -> + AST = run_parser(file(), String, Opts), + case expand_includes(AST, Included, Opts) of + {ok, AST1} -> AST1; + {error, Err} -> parse_error(Err) + end. + + +run_parser(P, Inp) -> + escape_errors(parse_and_scan(P, Inp, [])). +run_parser(P, Inp, Opts) -> + escape_errors(parse_and_scan(P, Inp, Opts)). + +parse_and_scan(P, S, Opts) -> + set_current_file(proplists:get_value(src_file, Opts, no_file)), + set_current_dir(proplists:get_value(src_dir, Opts, no_file)), + set_current_include_type(proplists:get_value(include_type, Opts, none)), + case so_scan:scan(S) of + {ok, Tokens} -> so_parse_lib:parse(P, Tokens); + {error, {{Input, Pos}, _}} -> + {error, {Pos, scan_error, Input}} + end. 
+ +``` + +So there's a lot of metadata being kept, but the key part is the call to +`so_scan:scan/1` + +```erl +lexer() -> + Number = fun(Digit) -> [Digit, "+(_", Digit, "+)*"] end, + DIGIT = "[0-9]", + HEXDIGIT = "[0-9a-fA-F]", + LOWER = "[a-z_]", + UPPER = "[A-Z]", + CON = [UPPER, "[a-zA-Z0-9_]*"], + INT = Number(DIGIT), + HEX = ["0x", Number(HEXDIGIT)], + BYTES = ["#", Number(HEXDIGIT)], + WS = "[\\000-\\ ]+", + ID = [LOWER, "[a-zA-Z0-9_']*"], + TVAR = ["'", ID], + QID = ["(", CON, "\\.)+", ID], + QCON = ["(", CON, "\\.)+", CON], + OP = "[=!<>+\\-*/:&|?~@^]+", + %% Five cases for a character + %% * 1 7-bit ascii, not \ or ' + %% * 2-4 8-bit values (UTF8) + %% * \ followed by a known modifier [aernrtv] + %% * \xhh + %% * \x{hhh...} + CHAR = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'", + STRING = "\"([^\"\\\\]|(\\\\.))*\"", + + CommentStart = {"/\\*", push(comment, skip())}, + CommentRules = + [ CommentStart + , {"\\*/", pop(skip())} + , {"[^/*]+|[/*]", skip()} ], + + Keywords = ["contract", "include", "let", "switch", "type", "record", "datatype", "if", "elif", "else", "function", + "stateful", "payable", "true", "false", "mod", "public", "entrypoint", "private", "indexed", "namespace", + "interface", "main", "using", "as", "for", "hiding", "band", "bor", "bxor", "bnot" + ], + KW = string:join(Keywords, "|"), + + Rules = + %% Comments and whitespace + [ CommentStart + , {"//.*", skip()} + , {WS, skip()} + + %% Special characters + , {"\\.\\.|[,.;()\\[\\]{}]", symbol()} + + %% Literals + , {CHAR, token(char, fun parse_char/1)} + , {STRING, token(string, fun parse_string/1)} + , {HEX, token(hex, fun parse_hex/1)} + , {INT, token(int, fun parse_int/1)} + , {BYTES, token(bytes, fun parse_bytes/1)} + + %% Identifiers (qualified first!) 
+ , {QID, token(qid, fun(S) -> string:tokens(S, ".") end)} + , {QCON, token(qcon, fun(S) -> string:tokens(S, ".") end)} + , {TVAR, token(tvar)} + , override({ID, token(id)}, {KW, symbol()}) %% Keywords override identifiers. Need to + , {CON, token(con)} %% use override to avoid lexing "lettuce" + %% as ['let', {id, "tuce"}]. + %% Operators + , {OP, symbol()} + ], + + [{code, Rules}, {comment, CommentRules}]. + +scan(String) -> + Lexer = so_scan_lib:compile(lexer()), + so_scan_lib:string(Lexer, code, String). +``` + +OK. let's look at `so_scan_lib` + +```erl +-type regex() :: iodata() | unicode:charlist(). +-type pos() :: {integer(), integer()}. +-type lex_state() :: atom(). +-type token() :: {atom(), pos(), term()} | {atom(), pos()}. + +-type token_spec() :: {regex(), token_action()}. +-opaque token_action() :: fun((string(), pos()) -> {tok_result(), state_change()}). + +-opaque lexer() :: [{lex_state(), + fun((string(), pos()) -> {ok, tok_result(), string(), pos()} + | end_of_file | error)}]. + +%% -- Internal types -- +-type tok_result() :: {token, token()} | skip. +-type state_change() :: none | pop | {push, lex_state()}. + +%% @doc Compile a lexer specification. Takes the regexps for each state and +%% combines them into a single big regexp that is then compiled with re:compile/1. +%% Note: contrary to lexer generators like leex, we don't have longest match +%% semantics (since this isn't supported by re). Use override/2 instead. +-spec compile([{lex_state(), [token_spec()]}]) -> lexer(). +compile(TokenSpecs) -> + [{S, compile_spec(Spec)} || {S, Spec} <- TokenSpecs]. 
+ +compile_spec(TokenSpecs) -> + WithIxs = lists:zip(lists:seq(1, length(TokenSpecs)), TokenSpecs), + {ok, Regex} = re:compile(["^(", name(0), string:join([ ["(", name(I), R, ")"] || {I, {R, _}} <- WithIxs ], "|"),")"]), + Actions = [ Fun || {_, Fun} <- TokenSpecs ], + fun ("", _Pos) -> end_of_file; + (S, Pos) -> + case re:run(S, Regex, [{capture, all_names}]) of + {match, [{0, N} | Capture]} -> + Index = 1 + length(lists:takewhile(fun({P, _}) -> P == -1 end, Capture)), + Action = lists:nth(Index, Actions), + {TokS, Rest} = lists:split(N, S), + Tok = Action(TokS, Pos), + {ok, Tok, Rest, next_pos(TokS, Pos)}; + nomatch -> + error + end + end. +``` + + + +# How does sophia compilation work diff --git a/Vim-Setup.md b/Vim-Setup.md index 88ded9a..8c7d2dc 100644 --- a/Vim-Setup.md +++ b/Vim-Setup.md @@ -4,88 +4,6 @@ documenting for myself ## Sophia syntax highlighting -todo. it's on github somewhere, not hard to find +See: -## fuzzy finding plugin - -this is annoying and requires like 10 minutes of setup. - -BUT this is super helpful in huge repositories such as the node codebase - -``` -sudo apt install bat fd-find fzf ripgrep -``` - -(devuan excalibur) - -ripgrep is optional, craig, but the vim plugin needs it if you want to search -for regexes *inside* files - -say you're trying to quickly remember what the fuck `gmser_id:id()` is. Like is -that the record or is that the 33-byte tagged public key? I can't remember and -neither can you - -![](./uploads/ripgrep-vim.png) - -this saves you like 15 seconds and a bunch of context switching. each time - -the plugin is super annoying to install but basically don't follow any of the -instructions in the repo. just clone the `fzf.vim` repo on github (google) to -`~/.vim/bundle/fzf.vim`. - -you also need to tell vim to load the `.vim` file that ships with the package - -``` -[pharpend@picklet ioecs/GajuDesk master] % dpkg -L fzf -/. 
-/usr -/usr/bin -/usr/bin/fzf -/usr/bin/fzf-tmux -/usr/share -/usr/share/doc -/usr/share/doc/fzf -/usr/share/doc/fzf/README-VIM.md.gz -/usr/share/doc/fzf/README.Debian -/usr/share/doc/fzf/README.md.gz -/usr/share/doc/fzf/changelog.Debian.amd64.gz -/usr/share/doc/fzf/changelog.Debian.gz -/usr/share/doc/fzf/changelog.gz -/usr/share/doc/fzf/copyright -/usr/share/doc/fzf/examples -/usr/share/doc/fzf/examples/completion.bash -/usr/share/doc/fzf/examples/completion.zsh -/usr/share/doc/fzf/examples/fzf.vim -/usr/share/doc/fzf/examples/key-bindings.bash -/usr/share/doc/fzf/examples/key-bindings.fish -/usr/share/doc/fzf/examples/key-bindings.zsh -/usr/share/doc/fzf/examples/plugin -/usr/share/fish -/usr/share/fish/vendor_functions.d -/usr/share/fish/vendor_functions.d/fzf_key_bindings.fish -/usr/share/man -/usr/share/man/man1 -/usr/share/man/man1/fzf-tmux.1.gz -/usr/share/man/man1/fzf.1.gz -/usr/share/doc/fzf/examples/plugin/fzf.vim -``` - -last file there. put that file at `~/.vim/autoload/fzf.vim` - -should just work. - -- `:Files` opens the fuzzy file finder -- `:Rg` is the interactive grep thing shown above - -i have this vimrc: - -```vim -let $FZF_DEFAULT_COMMAND = 'fdfind --type f' -noremap :Files -noremap :Rg -``` - -the fdfind thing means fuzzy find doesn't surface files in your .gitignore -(e.g. 
beam files, `_build` insanity) - -will try and see +Read the link there and do the needful diff --git a/uploads/TokensEn.png b/uploads/TokensEn.png new file mode 100644 index 0000000..67db8ff Binary files /dev/null and b/uploads/TokensEn.png differ diff --git a/uploads/python-doc1.png b/uploads/python-doc1.png new file mode 100644 index 0000000..2ea0007 Binary files /dev/null and b/uploads/python-doc1.png differ diff --git a/uploads/python-doc2.png b/uploads/python-doc2.png new file mode 100644 index 0000000..49cc387 Binary files /dev/null and b/uploads/python-doc2.png differ diff --git a/uploads/ripgrep-vim.png b/uploads/ripgrep-vim.png deleted file mode 100644 index fea2420..0000000 Binary files a/uploads/ripgrep-vim.png and /dev/null differ diff --git a/uploads/tokens-c.png b/uploads/tokens-c.png new file mode 100644 index 0000000..f15120c Binary files /dev/null and b/uploads/tokens-c.png differ