Compare commits

..

9 Commits

Author SHA1 Message Date
Peter Harpending f04b7311f5 stuff 2026-06-05 00:58:53 -07:00
pharpend 10424927b1 stuff 2026-06-04 14:01:46 -07:00
pharpend fdb40dcb92 stuff 2026-06-04 11:42:48 -07:00
Peter Harpending e180dc955d stuff 2026-06-03 19:28:55 -07:00
Peter Harpending 4e54bebeba parens work... moving on to documenting work 2026-06-03 15:17:55 -07:00
Peter Harpending 4f4adaa284 stopping point 2026-06-02 16:51:05 -07:00
Peter Harpending 2c36a02331 all the old tests pass... moving on 2026-06-02 12:48:41 -07:00
Peter Harpending 5cae022b8b Merge remote-tracking branch 'refs/remotes/origin/master' 2026-06-02 11:04:54 -07:00
Peter Harpending dfb158e593 unicode 2026-06-02 11:04:34 -07:00
20 changed files with 1433 additions and 96 deletions
+12 -7
View File
@@ -1,10 +1,15 @@
# TODONE
# TODO
- barf for outputs, slurp for inputs
- architecture needs more careful thought but only after something works
- architecture needs more careful thought but only after something
works
- too fuzzy right now
- possibly:
- rename parser layers sequentially:
- gsc_
- undo gs_ naming fuckery.. everything is `gsc_*`. it's just
needlessly confusing. for now let's name new things gsc_* and then
go back and undo the stupidity
# TONOTDO
- barf for outputs, slurp for inputs
- rename parser layers sequentially
# TODONE
+77
View File
@@ -0,0 +1,77 @@
% @doc
% turn the signal of a whole file into a `file` stem
%
% the column of the first token is taken as the column
% of the file-level block; the result has exactly one
% kid, which is that block after its items have been
% converted by t2t_parse_tds_in_block/1
%
% an empty signal is an error(empty_file)
-spec s2t_file(Signal) -> AstFile when
    Signal  :: [tk()],
    AstFile :: #ns{meta :: file, kids :: asf()}.

s2t_file([#tk{pos = {_, Col}} | _] = Signal) ->
    RawBlock = s2t_gulp_block(Col, Signal),
    #ns{meta = file, kids = [t2t_parse_tds_in_block(RawBlock)]};
s2t_file([]) ->
    error(empty_file).
% @doc
% wrap the whole signal into a `block` stem whose kids
% are the block items found by s2f_block_items/2
%
% crashes with badmatch unless every token sits at
% column BCol or further right
-spec s2t_gulp_block(BlkCol, Signal) -> Block when
    BlkCol :: pos_integer(),
    Signal :: [tk()],
    Block  :: #ns{meta :: block}.

s2t_gulp_block(BCol, Tks) ->
    % sanity check: nothing may be left of the block column
    true = lists:all(fun(#tk{pos = {_, Col}}) -> Col >= BCol end, Tks),
    #ns{meta = block, kids = s2f_block_items(BCol, Tks)}.
% @doc
% chop a signal into `block_item` stems
%
% each item must start with a token at exactly column
% BCol (anything else is a function_clause crash); the
% item then runs as far as s2t_slurp_block_item/3 takes
% it
-spec s2f_block_items(BCol, Signal) -> BlkItems when
    BCol     :: pos_integer(),
    Signal   :: [tk()],
    BlkItems :: [BlkItem],
    BlkItem  :: #ns{meta :: block_item,
                    kids :: asf()}.

s2f_block_items(BCol, Signal) ->
    s2f_block_items(BCol, [], Signal).

% Acc holds the items found so far, newest first
s2f_block_items(BCol, Acc, [#tk{pos = {_, BCol}} = Head | Tail]) ->
    {slurp, Item, Remaining} = s2t_slurp_block_item(BCol, Head, Tail),
    s2f_block_items(BCol, [Item | Acc], Remaining);
s2f_block_items(_BCol, Acc, []) ->
    lists:reverse(Acc).
% slurp one block item: T0 is the item's first token, F0
% is everything after it. returns the item as a
% `block_item` stem together with the unconsumed rest.
s2t_slurp_block_item(BCol, T0, F0) ->
    {Tokens, Rest} = s2s_sw_block_item(BCol, T0, F0),
    {slurp, #ns{meta = block_item, kids = Tokens}, Rest}.
% sw = splitwith; kind of take/drop
%
% the item is T0 plus the longest prefix of F0 whose
% tokens are all strictly right of BCol; the second
% element of the result is what is left of F0
s2s_sw_block_item(BCol, T0, F0) ->
    IsDeeper = fun(#tk{pos = {_, Col}}) -> Col > BCol end,
    {Continuation, Remaining} = lists:splitwith(IsDeeper, F0),
    {[T0 | Continuation], Remaining}.
% @doc
% rewrite a `block` stem: every kid (a block_item) is
% replaced by whatever t2t_parse_td_from_item/1 makes
% of it; the block's meta is left alone
-spec t2t_parse_tds_in_block(Block0) -> Block1 when
    Block0 :: ast(),
    Block1 :: ast().

t2t_parse_tds_in_block(#ns{meta = block, kids = Items} = Block) ->
    Block#ns{kids = lists:map(fun t2t_parse_td_from_item/1, Items)}.
% @doc
% unwrap a `block_item` stem and hand its kids (the
% item's signal) to s2t_top_decl/1
-spec t2t_parse_td_from_item(BlockItem) -> TopDecl when
    BlockItem :: #ns{meta :: block_item},
    TopDecl   :: #ns{meta :: td_meta()}.

t2t_parse_td_from_item(#ns{meta = block_item} = Item) ->
    s2t_top_decl(Item#ns.kids).
-spec s2t_top_decl(Signal) -> TdTree when
Signal :: [tk()],
TdTree :: ast().
s2t_top_decl(S0) ->
+292
View File
@@ -0,0 +1,292 @@
# Syntax
## Lexical syntax
### Comments
Single line comments start with `//` and block comments are enclosed in `/*`
and `*/` and can be nested.
### Keywords
```
contract include let switch type record datatype if elif else function
stateful payable true false mod public entrypoint private indexed namespace
interface main using as for hiding
```
### Tokens
- `Id = [a-z_][A-Za-z0-9_']*` identifiers start with a lower case letter.
- `Con = [A-Z][A-Za-z0-9_']*` constructors start with an upper case letter.
- `QId = (Con\.)+Id` qualified identifiers (e.g. `Map.member`)
- `QCon = (Con\.)+Con` qualified constructor
- `TVar = 'Id` type variable (e.g. `'a`, `'b`)
- `Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)*` integer literal with optional `_` separators
- `Bytes = #[0-9A-Fa-f]+(_[0-9A-Fa-f]+)*` byte array literal with optional `_` separators
- `String` string literal enclosed in `"` with escape character `\`
- `Char` character literal enclosed in `'` with escape character `\`
- `AccountAddress` base58-encoded 32 byte account pubkey with `ak_` prefix
- `ContractAddress` base58-encoded 32 byte contract address with `ct_` prefix
- `Signature` base58-encoded 64 byte cryptographic signature with `sg_` prefix
Valid string escape codes are
| Escape | ASCII | |
|---------------|-------------|---|
| `\b` | 8 | |
| `\t` | 9 | |
| `\n` | 10 | |
| `\v` | 11 | |
| `\f` | 12 | |
| `\r` | 13 | |
| `\e` | 27 | |
| `\xHexDigits` | *HexDigits* | |
See the [identifier encoding scheme](https://git.qpq.swiss/QPQ-AG/protocol/src/branch/master/node/api/api_encoding.md) for the
details on the base58 literals.
## Layout blocks
Sophia uses Python-style layout rules to group declarations and statements. A
layout block with more than one element must start on a separate line and be
indented more than the currently enclosing layout block. Blocks with a single
element can be written on the same line as the previous token.
Each element of the block must share the same indentation and no part of an
element may be indented less than the indentation of the block. For instance
```sophia
contract Layout =
function foo() = 0 // no layout
function bar() = // layout block starts on next line
let x = foo() // indented more than 2 spaces
x
+ 1 // the '+' is indented more than the 'x'
```
## Notation
In describing the syntax below, we use the following conventions:
- Upper-case identifiers denote non-terminals (like `Expr`) or terminals with
some associated value (like `Id`).
- Keywords and symbols are enclosed in single quotes: `'let'` or `'='`.
- Choices are separated by vertical bars: `|`.
- Optional elements are enclosed in `[` square brackets `]`.
- `(` Parentheses `)` are used for grouping.
- Zero or more repetitions are denoted by a postfix `*`, and one or more
repetitions by a `+`.
- `Block(X)` denotes a layout block of `X`s.
- `Sep(X, S)` is short for `[X (S X)*]`, i.e. a possibly empty sequence of `X`s
separated by `S`s.
- `Sep1(X, S)` is short for `X (S X)*`, i.e. same as `Sep`, but must not be empty.
## Declarations
A Sophia file consists of a sequence of *declarations* in a layout block.
```c
File ::= Block(TopDecl)
TopDecl ::= ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl)
| 'contract' 'interface' Con [Implement] '=' Block(Decl)
| 'namespace' Con '=' Block(Decl)
| '@compiler' PragmaOp Version
| 'include' String
| Using
Implement ::= ':' Sep1(Con, ',')
Decl ::= 'type' Id ['(' TVar* ')'] '=' TypeAlias
| 'record' Id ['(' TVar* ')'] '=' RecordType
| 'datatype' Id ['(' TVar* ')'] '=' DataType
| 'let' Id [':' Type] '=' Expr
| (EModifier* 'entrypoint' | FModifier* 'function') Block(FunDecl)
| Using
FunDecl ::= Id ':' Type // Type signature
| Id Args [':' Type] '=' Block(Stmt) // Definition
| Id Args [':' Type] Block(GuardedDef) // Guarded definitions
GuardedDef ::= '|' Sep1(Expr, ',') '=' Block(Stmt)
Using ::= 'using' Con ['as' Con] [UsingParts]
UsingParts ::= 'for' '[' Sep1(Id, ',') ']'
| 'hiding' '[' Sep1(Id, ',') ']'
PragmaOp ::= '<' | '=<' | '==' | '>=' | '>'
Version ::= Sep1(Int, '.')
EModifier ::= 'payable' | 'stateful'
FModifier ::= 'stateful' | 'private'
Args ::= '(' Sep(Pattern, ',') ')'
```
Contract declarations must appear at the top-level.
For example,
```sophia
contract Test =
type t = int
entrypoint add (x : t, y : t) = x + y
```
There are three forms of type declarations: type aliases (declared with the
`type` keyword), record type definitions (`record`) and data type definitions
(`datatype`):
```c
TypeAlias ::= Type
RecordType ::= '{' Sep(FieldType, ',') '}'
DataType ::= Sep1(ConDecl, '|')
FieldType ::= Id ':' Type
ConDecl ::= Con ['(' Sep1(Type, ',') ')']
```
For example,
```sophia
record point('a) = {x : 'a, y : 'a}
datatype shape('a) = Circle(point('a), 'a) | Rect(point('a), point('a))
type int_shape = shape(int)
```
## Types
```c
Type ::= Domain '=>' Type // Function type
| Type '(' Sep(Type, ',') ')' // Type application
| '(' Type ')' // Parens
| 'unit' | Sep(Type, '*') // Tuples
| Id | QId | TVar
Domain ::= Type // Single argument
| '(' Sep(Type, ',') ')' // Multiple arguments
```
The function type arrow associates to the right.
Example,
```sophia
'a => list('a) => (int * list('a))
```
## Statements
Function bodies are blocks of *statements*, where a statement is one of the following:
```c
Stmt ::= 'switch' '(' Expr ')' Block(Case)
| 'if' '(' Expr ')' Block(Stmt)
| 'elif' '(' Expr ')' Block(Stmt)
| 'else' Block(Stmt)
| 'let' LetDef
| Using
| Expr
LetDef ::= Id Args [':' Type] '=' Block(Stmt) // Function definition
| Pattern '=' Block(Stmt) // Value definition
Case ::= Pattern '=>' Block(Stmt)
| Pattern Block(GuardedCase)
GuardedCase ::= '|' Sep1(Expr, ',') '=>' Block(Stmt)
Pattern ::= Expr
```
`if` statements can be followed by zero or more `elif` statements and an optional final `else` statement. For example,
```sophia
let x : int = 4
switch(f(x))
None => 0
Some(y) =>
if(y > 10)
"too big"
elif(y < 3)
"too small"
else
"just right"
```
## Expressions
```c
Expr ::= '(' LamArgs ')' '=>' Block(Stmt) // Anonymous function (x) => x + 1
| '(' BinOp ')' // Operator lambda (+)
| 'if' '(' Expr ')' Expr 'else' Expr // If expression if(x < y) y else x
| Expr ':' Type // Type annotation 5 : int
| Expr BinOp Expr // Binary operator x + y
| UnOp Expr // Unary operator ! b
| Expr '(' Sep(Expr, ',') ')' // Application f(x, y)
| Expr '.' Id // Projection state.x
| Expr '[' Expr ']' // Map lookup map[key]
| Expr '{' Sep(FieldUpdate, ',') '}' // Record or map update r{ fld[key].x = y }
| '[' Sep(Expr, ',') ']' // List [1, 2, 3]
| '[' Expr '|' Sep(Generator, ',') ']'
// List comprehension [k | x <- [1], if (f(x)), let k = x+1]
| '[' Expr '..' Expr ']' // List range [1..n]
| '{' Sep(FieldUpdate, ',') '}' // Record or map value {x = 0, y = 1}, {[key] = val}
| '(' Expr ')' // Parens (1 + 2) * 3
| '(' Expr '=' Expr ')' // Assign pattern (y = x::_)
| Id | Con | QId | QCon // Identifiers x, None, Map.member, AELib.Token
| Int | Bytes | String | Char // Literals 123, 0xff, #00abc123, "foo", '%'
| AccountAddress | ContractAddress // Chain identifiers
| Signature // Signature
| '???' // Hole expression 1 + ???
Generator ::= Pattern '<-' Expr // Generator
| 'if' '(' Expr ')' // Guard
| LetDef // Definition
LamArgs ::= Sep(LamArg, ',')
LamArg ::= Id [':' Type]
FieldUpdate ::= Path '=' Expr
Path ::= Id // Record field
| '[' Expr ']' // Map key
| Path '.' Id // Nested record field
| Path '[' Expr ']' // Nested map key
BinOp ::= '||' | '&&' | '<' | '>' | '=<' | '>=' | '==' | '!='
| '::' | '++' | '+' | '-' | '*' | '/' | 'mod' | '^'
| 'band' | 'bor' | 'bxor' | '<<' | '>>' | '|>'
UnOp ::= '-' | '!' | 'bnot'
```
## Operator types
| Operators | Type
| --- | ---
| `-` `+` `*` `/` `mod` `^` | arithmetic operators
| `!` `&&` `\|\|` | logical operators
| `band` `bor` `bxor` `bnot` `<<` `>>` | bitwise operators
| `==` `!=` `<` `>` `=<` `>=` | comparison operators
| `::` `++` | list operators
| `\|>` | functional operators
## Operator precedence
In order of highest to lowest precedence.
| Operators | Associativity
| --- | ---
| `!` `bnot`| right
| `^` | left
| `*` `/` `mod` | left
| `-` (unary) | right
| `+` `-` | left
| `<<` `>>` | left
| `::` `++` | right
| `<` `>` `=<` `>=` `==` `!=` | none
| `band` | left
| `bxor` | left
| `bor` | left
| `&&` | right
| `\|\|` | right
| `\|>` | left
+67
View File
@@ -0,0 +1,67 @@
% @doc make into a tree
%
% the signal is first hung under a single `vtokens`
% stem, then rerooted on "=>" and after that on "*"
-spec mktree(Signal) -> Tree when
    Signal :: gsc:signal(),
    Tree   :: gsc_ntree:ntree().

mktree(Sig) ->
    Flat = gsc_ntree:nstem(vtokens, Sig),
    % foldl calls rerootl_tkstr(OpStr, TreeSoFar), in list order
    lists:foldl(fun rerootl_tkstr/2, Flat, ["=>", "*"]).
% reroot a tree on the leftmost kid that is a token with
% string S; the right-hand side is rerooted recursively
%
% input:
%   *s Root0
%   |
%   +-- .l Foo
%   +-- .l "=>"
%   +-- .l Bar
% output:
%   *s "=>"
%   |
%   +-- *s Root0 -- .l Foo
%   +-- *s Root0 -- .l Bar
%
% if no kid matches, the tree comes back untouched
rerootl_tkstr(S, Tree0 = #ns{val = Root0}) ->
    Kids0 = gsc_ntree:deleaf0(Tree0),
    case lists:splitwith(fun(Tk) -> isnt_str(S, Tk) end, Kids0) of
        % not found, nothing to do
        {Kids0, []} ->
            Tree0;
        % found: both halves keep the old root value, the
        % matching token becomes the new root
        {Before, [OpTk | After]} ->
            Left  = gsc_ntree:releaf0(Root0, Before),
            Right = rerootl_tkstr(S, gsc_ntree:releaf0(Root0, After)),
            gsc_ntree:releaf0({op, OpTk}, [Left, Right])
    end.
%reroot_mapsto(Tree0 = #ns{val = Root0}) ->
% Kids0 = gsc_ntree:deleaf0(Tree0),
% IsntMapsto = fun(DL) -> isnt_str("=>", Tk) end,
% case lists:splitwith(IsntMapsto, Kids0) of
% % found
% {LHS1, [Tk0 | RHS1]} ->
% Root1 = Root0,
% LTree1 = gsc_ntree:releaf0(Root1, LHS1),
% RTree1 = reroot_mapsto(gsc_ntree:releaf0(Root1, RHS1)),
% NewRoot0 = {op, Tk0},
% NewKids0 = [LTree1, RTree1],
% NewTree = gsc_ntree:releaf0(NewRoot0, NewKids0),
% NewTree;
% % nothing to do
% {Kids0, []} ->
% Tree0
% end.
% negation of is_str/2
-spec isnt_str(term(), term()) -> boolean().
isnt_str(Str, Thing) ->
    not is_str(Str, Thing).
% true iff the second argument is a #tk{} whose `str`
% field is exactly S; anything else (including non-token
% terms) gives false
is_str(S, #tk{str = Str}) ->
    Str =:= S;
is_str(_, _) ->
    false.
+68 -5
View File
@@ -47,6 +47,8 @@ do(["list", "tests"]) ->
do_tlist();
do(["test"]) ->
do_tests();
do(["test" | Tests]) ->
do_tests(Tests);
do(["tests"]) ->
do_tests();
do(["run", "tests"]) ->
@@ -75,18 +77,79 @@ do(Args) ->
do_doi() ->
FP = zx:get_home() ++ "/priv/doi.txt",
Cmd = "less " ++ FP,
io:format("~s~n", [Cmd]).
page_file(FP).
% thank you chatgpt
% os:cmd didnt do nuffin because that's for running
% stuff in the background and capturing the output, not
% for taking over the screen
% show a file through `less` when it can be found on the
% PATH, otherwise just dump it to stdout
page_file(FilePath) ->
    case os:find_executable("less") of
        false    -> cat_file(FilePath);
        LessPath -> less_file(LessPath, FilePath)
    end.
% fallback pager: read the whole file and print it.
% crashes with badmatch if the file cannot be read.
cat_file(FilePath) ->
    {ok, Contents} = file:read_file(FilePath),
    io:format("~ts", [Contents]).
% run `less` on FilePath as a port and block until it
% exits
%
% Less     - path of the executable to spawn
% FilePath - passed to it as its only argument
%
% returns ok on exit status 0, raises
% error({less_exit_status, N}) on any other status
less_file(Less, FilePath) ->
Port = open_port({spawn_executable, Less},
[{args, [FilePath]},
% nouse_stdio: the port does not use the program's
% stdin/stdout for talking to erlang
nouse_stdio, exit_status]),
% no `after` clause: this waits for as long as the
% pager stays open
receive
{Port, {exit_status, 0}} ->
ok;
{Port, {exit_status, N}} ->
error({less_exit_status, N});
% NOTE(review): an 'EXIT' message only arrives if this
% process traps exits -- confirm the caller sets
% trap_exit, otherwise this clause never fires
{'EXIT', Port, Reason} ->
error(Reason)
end.
do_tests() ->
io:format("TestModules = ~p~n", [known_modules_with_prefix("ts")]),
io:format("TestModules = ~p~n", [test_mods()]),
do_runall_tests().
% run the main of every test module reported by
% test_mods/0, in the order they are reported
do_runall_tests() ->
    Mods = test_mods(),
    lists:foreach(fun(Mod) -> run_mod_main(Mod) end, Mods).
% run only the named tests; List holds the test names as
% given on the command line. stops at the first name
% that run_test/1 rejects.
do_tests(List) ->
    lists:foreach(fun(Name) -> run_test(Name) end, List).
% n
% run one test module selected by name
%
% TestName is a string from the command line. it may be
% the full module name or just the part after
% "gsc_test_"; the full name wins if both exist.
%
% raises error({no_such_test, TestName}) if neither
% candidate is a known test module.
%
% the lookup compares strings instead of calling
% list_to_atom/1 on the argument, so arbitrary command
% line input never creates new atoms (atoms are not
% garbage collected).
%
% assumes test_mods/0 returns a list of module atoms
% -- TODO confirm
run_test(TestName) ->
    % we have two candidate names, in order of preference
    Candidates = [TestName, "gsc_test_" ++ TestName],
    KnownMods = test_mods(),
    Matches = [Mod || Candidate <- Candidates,
                      Mod <- KnownMods,
                      atom_to_list(Mod) =:= Candidate],
    case Matches of
        [Mod | _] -> rmm(Mod);
        []        -> error({no_such_test, TestName})
    end.
% rmm = run_mod_main, short alias
rmm(Mod) ->
    run_mod_main(Mod).
% KnownTests = test_mods(),
% TestMods = ensure_all_known([], List, KnownTests),
% lists:foreach(fun run_mod_main/1, TestMods).
%ensure_all_known(Acc, [], _) ->
% lists:sort(Acc);
%ensure_all_known(Acc, [T | Ts], Knowns) ->
% case lists:member(T, Knowns) of
%
% end.
test_mods() ->
known_modules_with_prefix("gs_test").
known_modules_with_prefix("gsc_test").
known_modules_with_prefix(Pfx) ->
ModsZipBeamsZipLoaded = code:all_available(),
@@ -134,7 +197,7 @@ do_eshell() ->
end.
tokenizers_agree(File) ->
so_tokens(File) =:= tokens(File).
gso_tokens(File) =:= so_tokens(File).
do_tokens(FilePath) ->
+273
View File
@@ -0,0 +1,273 @@
% @doc experiment centering around the file syntax node using ntree approach
-module(gsc_test_file).
-export([
main/0
]).
-include("$gsc_include/gsc.hrl").
% metadata of a contract top-level declaration, i.e. the
% tokens of
%   ['payable'] ['main'] 'contract' Con [Implement] '='
%
% `none` means "not looked at yet"; s2s_sm_ct/2 fills the
% fields one after the other, in the order listed here
-record(ct,
% false = keyword absent, {true, Tk} = keyword token
{payable = none :: none | false | {true, tk()},
main = none :: none | false | {true, tk()},
% the 'contract' keyword token
contract = none :: none | tk(),
% the contract name token
con = none :: none | tk(),
% names after ':' ([] if there is no ':' part)
impls = none :: none | [tk()],
% the '=' token
eq = none :: none | tk()}).
-type meta() :: #ct{}.
% metadata of a type alias declaration, i.e. the tokens of
%   'type' Id ['(' TVar* ')'] '='
%
% same `none` convention as #ct{}; filled by
% s2s_sm_decl_type/2
-record(decl_type,
{type = none :: none | tk(),
id = none :: none | tk(),
% s2s_sm_decl_type/2 currently only ever stores [] here
params = none :: none | [tk()],
eq = none :: none | tk()}).
-type decl_meta() :: #decl_type{}.
% what a stem of the AST can carry as its meta
-type ast_meta() :: file
| meta()
| decl_meta()
| nyi
| {nyi, any()}
.
% kinds of top-level declaration
%
% NOTE(review): s2t(top_decl, _) produces `namespace`
% for namespaces, which is not listed here (`ns` is) --
% confirm which spelling is intended
-type target()
:: ct
| iface
| ns
| pragma
| include
| using
.
% first argument of s2t/2
%
% NOTE(review): s2t/2 is also called with `decl`,
% `decl_type` and `type`, none of which appear here --
% confirm whether they should be added
-type s2t_target()
:: file
| top_decl
| target()
| nyi
| {nyi, any()}
.
% first argument of s2f/2
-type s2f_target()
:: {block_of, s2t_target()}
.
% tree / forest with ast_meta() on the stems and tokens
% as leaves
-type ast() :: ntree(ast_meta(), tk()).
-type asf() :: nforest(ast_meta(), tk()).
% parse ct/hello.aes as a file and print the source
% followed by the resulting AST
main() ->
    Path = ts_utils:ct_file_abspath("hello.aes"),
    {ok, Source} = file:read_file(Path),
    Signal = gsc:unsafe_signal_from_file(Path),
    Ast = s2t(file, Signal),
    % the source goes out fenced, then the tree
    io:format("hello.aes:~n", []),
    io:format("```~n", []),
    io:format("~ts", [Source]),
    io:format("```~n~n", []),
    io:format("AST: ~tp~n", [Ast]),
    ok.
% // Hello World Contract
% // Copyright (c) 2025 QPQ AG
%
% contract Hello =
% type state = unit
% entrypoint init(): state =
% ()
%
% entrypoint hello(): string =
% "hello, world"
% @doc
% signal-to-tree: parse Signal as the syntax element
% named by ParseTarget
%
% `file`, `top_decl`, `ct`, `decl` and `decl_type` are
% implemented. every other target (including ones that
% top_decl/decl dispatch to, e.g. `iface` or
% `decl_record`) produces a stem with meta
% `nyi` / `{nyi, Target}` whose kids are the untouched
% signal.
%
% errors:
%   error(empty_file)       file target with empty signal
%   error({bad_decl, S})    decl that matches no known form
%   case_clause             top_decl that matches no known form
-spec s2t(ParseTarget, Signal) -> AST when
    ParseTarget :: s2t_target() | atom(),
    Signal      :: [tk()],
    AST         :: ast().

% File ::= Block(TopDecl)
s2t(file, Signal) ->
    case Signal of
        [] -> error(empty_file);
        _  -> {ns, file, s2f({block_of, top_decl}, Signal)}
    end;
% TopDecl ::= ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl)
%           | ['payable'] 'contract' 'interface' Con [Implement] '=' Block(Decl)
%           | 'namespace' Con '=' Block(Decl)
%           | '@compiler' PragmaOp Version
%           | 'include' String
%           | Using
s2t(top_decl, Signal) ->
    % dispatch on (up to) the first three token strings
    NewTarget =
        case gsc_tokens:strings(3, Signal) of
            ["payable", "contract", "interface"] -> iface;
            ["contract", "interface" | _]        -> iface;
            ["payable", "main", "contract"]      -> ct;
            ["payable", "contract" | _]          -> ct;
            % 'payable' is optional, so 'main' may come first
            ["main", "contract" | _]             -> ct;
            ["contract" | _]                     -> ct;
            ["namespace" | _]                    -> namespace;
            ["@compiler" | _]                    -> pragma;
            ["include" | _]                      -> include;
            ["using" | _]                        -> using
        end,
    s2t(NewTarget, Signal);
% ['payable'] ['main'] 'contract' Con [Implement] '=' Block(Decl)
s2t(ct, S0) ->
    {slurp, CtMeta, S1} = s2s_slurp_meta(#ct{}, S0),
    {ns, CtMeta, s2f({block_of, decl}, S1)};
% Decl ::= 'type' Id ['(' TVar* ')'] '=' TypeAlias
%        | 'record' Id ['(' TVar* ')'] '=' RecordType
%        | 'datatype' Id ['(' TVar* ')'] '=' DataType
%        | 'let' Id [':' Type] '=' Expr
%        | (EModifier* 'entrypoint' | FModifier* 'function') Block(FunDecl)
%        | Using
s2t(decl, S0) ->
    NewTarget =
        case gsc_tokens:strings(3, S0) of
            ["type" | _]     -> decl_type;
            ["record" | _]   -> decl_record;
            ["datatype" | _] -> decl_datatype;
            ["let" | _]      -> decl_let;
            Pfx3 ->
                % modifiers may precede the keyword, so look
                % for it anywhere in the prefix
                IsEp = lists:member("entrypoint", Pfx3),
                IsFn = lists:member("function", Pfx3),
                if
                    IsEp -> decl_entrypoint;
                    IsFn -> decl_function;
                    true -> error({bad_decl, S0})
                end
        end,
    s2t(NewTarget, S0);
% 'type' Id ['(' TVar* ')'] '=' TypeAlias
s2t(decl_type, S0) ->
    {slurp, Meta, S1} = s2s_slurp_meta(#decl_type{}, S0),
    % kids of a stem must be a forest (a list), so the
    % single type tree gets wrapped
    {ns, Meta, [s2t(type, S1)]};
s2t(nyi, Signal) ->
    {ns, nyi, Signal};
s2t(NYI = {nyi, _}, Signal) ->
    {ns, NYI, Signal};
% anything we do not know how to parse yet
s2t(NYI, Signal) ->
    {ns, {nyi, NYI}, Signal}.
% @doc
% signal-to-forest: split the signal into block items
% with gsc_signal:gulp_block_items/1 and parse every
% item as TreeTarget
-spec s2f(ForestTarget, Signal) -> Forest when
    ForestTarget :: s2f_target(),
    Signal       :: [tk()],
    Forest       :: asf().

s2f({block_of, TreeTarget}, S0) ->
    {gulp, Items} = gsc_signal:gulp_block_items(S0),
    lists:map(fun(Item) -> s2t(TreeTarget, Item) end, Items).
% @doc
% slurp the leading tokens of a declaration into its
% meta record; which slurper runs depends on the record
% that is passed in
%
% anything that is neither a #ct{} nor a #decl_type{} is
% an error({s2s_slurp_meta, Meta, Signal})
-spec s2s_slurp_meta(InitMeta, Signal) -> Result when
    InitMeta  :: Meta,
    Signal    :: [tk()],
    Result    :: {slurp, Meta, NewSignal},
    Meta      :: ast_meta(),
    NewSignal :: Signal.

s2s_slurp_meta(Meta, Signal) ->
    case Meta of
        #ct{}        -> s2s_sm_ct(Meta, Signal);
        #decl_type{} -> s2s_sm_decl_type(Meta, Signal);
        _            -> error({s2s_slurp_meta, Meta, Signal})
    end.
% fill a #ct{} from the front of the signal, one field
% per step, in record order. the first field that is
% still `none` decides which step runs; once nothing is
% `none` any more the result is {slurp, Ct, RestSignal}.

% 'payable' (optional)
s2s_sm_ct(Ct = #ct{payable = none}, [#tk{str = "payable"} = Tk | Rest]) ->
    s2s_sm_ct(Ct#ct{payable = {true, Tk}}, Rest);
s2s_sm_ct(Ct = #ct{payable = none}, Sig) ->
    s2s_sm_ct(Ct#ct{payable = false}, Sig);
% 'main' (optional)
s2s_sm_ct(Ct = #ct{main = none}, [#tk{str = "main"} = Tk | Rest]) ->
    s2s_sm_ct(Ct#ct{main = {true, Tk}}, Rest);
s2s_sm_ct(Ct = #ct{main = none}, Sig) ->
    s2s_sm_ct(Ct#ct{main = false}, Sig);
% 'contract' (required)
s2s_sm_ct(Ct = #ct{contract = none}, [#tk{str = "contract"} = Tk | Rest]) ->
    s2s_sm_ct(Ct#ct{contract = Tk}, Rest);
s2s_sm_ct(Ct = #ct{contract = none}, Sig) ->
    error({no_kwd_contract, Ct, Sig});
% Con (required)
s2s_sm_ct(Ct = #ct{con = none}, [#tk{shape = con} = Tk | Rest]) ->
    s2s_sm_ct(Ct#ct{con = Tk}, Rest);
s2s_sm_ct(Ct = #ct{con = none}, Sig) ->
    error({no_contract_name, Ct, Sig});
% [Implement] (optional, [] when absent)
s2s_sm_ct(Ct = #ct{impls = none}, Sig) ->
    case gsc_tokens:strings(1, Sig) of
        [":"] ->
            {slurp, Impls, Rest} = s2f_slurp_impls(Sig),
            s2s_sm_ct(Ct#ct{impls = Impls}, Rest);
        _ ->
            s2s_sm_ct(Ct#ct{impls = []}, Sig)
    end;
% '=' (required)
s2s_sm_ct(Ct = #ct{eq = none}, [#tk{str = "="} = Tk | Rest]) ->
    s2s_sm_ct(Ct#ct{eq = Tk}, Rest);
s2s_sm_ct(Ct = #ct{eq = none}, Sig) ->
    error({no_equal_sign, Ct, Sig});
% nothing left to fill
s2s_sm_ct(Ct, Sig) ->
    {slurp, Ct, Sig}.
% slurp `':' Con (',' Con)*` off the front of the signal
%
% returns {slurp, Cons, Rest} with the Con tokens in
% source order. the signal has to start with ':' and a
% Con token, anything else is a function_clause crash.
s2f_slurp_impls([#tk{str = ":"}, #tk{shape = con} = First | Rest]) ->
    s2f_slurp_impls([First], Rest).

% Acc holds the Con tokens seen so far, newest first
s2f_slurp_impls(Acc, [#tk{str = ","}, #tk{shape = con} = Next | Rest]) ->
    s2f_slurp_impls([Next | Acc], Rest);
s2f_slurp_impls(Acc, Rest) ->
    {slurp, lists:reverse(Acc), Rest}.
%-record(decl_type,
% {type = none :: none | tk(),
% id = none :: none | tk(),
% params = none :: none | [tk()],
% eq = none :: none | tk()}).
% fill a #decl_type{} from the front of the signal, i.e.
% from the tokens of
%   'type' Id ['(' TVar* ')'] '='
% one field per step, in record order. the first field
% that is still `none` decides which step runs; once
% nothing is `none` any more the result is
% {slurp, Meta, RestSignal}.
%
% errors:
%   error({no_kwd_type, S})     signal does not start with 'type'
%   error({no_type_id, S})      no id token after 'type'
%   error({fixme, parens_bad})  type parameters are not handled yet
%   error({no_equal_sign, S})   no '=' where one is required
s2s_sm_decl_type(M = #decl_type{type = none}, S0) ->
    case S0 of
        [#tk{str = "type"} = T0 | S1] ->
            s2s_sm_decl_type(M#decl_type{type = T0}, S1);
        _ ->
            error({no_kwd_type, S0})
    end;
s2s_sm_decl_type(M = #decl_type{id = none}, S0) ->
    case S0 of
        [#tk{shape = id} = T0 | S1] ->
            s2s_sm_decl_type(M#decl_type{id = T0}, S1);
        _ ->
            error({no_type_id, S0})
    end;
s2s_sm_decl_type(M = #decl_type{params = none}, S0) ->
    case S0 of
        % the open paren token itself is not needed for the
        % error, so it is not bound
        [#tk{str = "("} | _] ->
            error({fixme, parens_bad});
        _ ->
            s2s_sm_decl_type(M#decl_type{params = []}, S0)
    end;
s2s_sm_decl_type(M = #decl_type{eq = none}, S0) ->
    case S0 of
        [#tk{str = "="} = T0 | S1] ->
            s2s_sm_decl_type(M#decl_type{eq = T0}, S1);
        _ ->
            error({no_equal_sign, S0})
    end;
% nothing left to fill
s2s_sm_decl_type(M, S0) ->
    {slurp, M, S0}.
+144
View File
@@ -0,0 +1,144 @@
-module(gsc_test_ntree).
-export([
main/0
]).
-include("$gsc_include/gsc.hrl").
% just parsing type expressions right now, so only need
% to worry about round parens
%
% op_arg is to indicate general-purpose grouping, for
% e.g. LHS/RHS of an op
-type syntax_meta()
:: {op, tk()}
| op_arg
| {parens, Open :: tk(), Close :: tk()}
.
-type ast() :: ntree(syntax_meta(), tk()).
-type asf() :: nforest(syntax_meta(), tk()).
-type asts() :: asf().
% entry point of the test module: runs example 00
main() ->
    x00(),
    ok.
% x00 = example00
% print every stage of example 00: source string,
% tokens, signal and parsed forest, in that order
x00() ->
    io:format("Example 00:~n", []),
    Stages = [{" SrcStr = ~p~n", fun x00_src/0},
              {" Tokens = ~p~n", fun x00_tks/0},
              {" Signal = ~p~n", fun x00_sgl/0},
              {" Forest = ~p~n", fun x00_fst/0}],
    % each stage is computed right before it is printed
    lists:foreach(fun({Fmt, Stage}) -> io:format(Fmt, [Stage()]) end,
                  Stages),
    ok.
% sample type expr, tokens, signal
% the type expression used by example 00
x00_src() ->
    "(foo => (bar) * baz)".

% all tokens of the example source
x00_tks() ->
    gsc:unsafe_tokens_from_string(x00_src()).

% the tokens after gsc:filter_signal/1
x00_sgl() ->
    gsc:filter_signal(x00_tks()).

% the signal after parse/1
x00_fst() ->
    parse(x00_sgl()).
% @doc
% parse a signal of a type expression into a forest
%
% passes, in order: parens grouping, then "=>", then
% "*". the result is the output of the last pass.
-spec parse(Signal) -> ASF when
    Signal :: [tk()],
    ASF    :: asf().

parse(Signal) ->
    % key insight here is our signal is already a
    % forest, assuming the leaf type is `tk()`.
    %
    % our parser is a sequence of forest-to-forest
    % transformers.
    %
    % at the end we should end up with just one tree (i
    % think)?
    F0 = Signal,
    F1 = f2f_parens(F0),
    F2 = f2f_op("=>", F1),
    F3 = f2f_op("*", F2),
    % return the result of the final pass; returning F2
    % here would throw the "*" pass away
    F3.
% forest-to-forest: group around the binary operator
% whose token string is OpStr
%
% at every level the forest is scanned left to right.
% at the first token with string OpStr the level
% collapses into a single `{op, OpTk}` stem with two
% `op_arg` kids: everything before the op, and
% everything after it (itself processed the same way).
% stems met before the op have their kids processed
% recursively.
f2f_op(OpStr, Fst) ->
    f2f_op(OpStr, [], Fst).

% Seen holds what was scanned so far, newest first

% never saw the op
f2f_op(_OpStr, Seen, []) ->
    lists:reverse(Seen);
% see op
f2f_op(OpStr, Seen, [#tk{str = OpStr} = OpTk | After]) ->
    Left  = #ns{meta = op_arg, kids = lists:reverse(Seen)},
    Right = #ns{meta = op_arg, kids = f2f_op(OpStr, After)},
    [#ns{meta = {op, OpTk}, kids = [Left, Right]}];
% see stem, descend
f2f_op(OpStr, Seen, [#ns{kids = Kids} = Stem | After]) ->
    Descended = Stem#ns{kids = f2f_op(OpStr, Kids)},
    f2f_op(OpStr, [Descended | Seen], After);
% see leaf, just add
f2f_op(OpStr, Seen, [Leaf | After]) ->
    f2f_op(OpStr, [Leaf | Seen], After).
% @doc
% recursive parens decomposition
%
% the input here is the flat list of tokens. every run
% of tokens from a `(` up to its matching `)` is
% replaced by one `{parens, Open, Close}` stem holding
% what was in between
%
% interesting quirk is that this doesn't error on too
% many close parens (a stray `)` is passed through like
% any other token), only on too many open parens
-spec f2f_parens(Forest) -> NewForest when
    Forest    :: asts(),
    NewForest :: Forest.

f2f_parens(Fst) ->
    f2f_parens([], Fst).

% Done holds the finished trees, newest first

% open paren: let slurp_pstem/3 eat up to the matching
% close paren, then carry on behind it
f2f_parens(Done, [#tk{str = "("} = Open | After]) ->
    {slurp, Group, Remaining} = slurp_pstem({parens, Open, none}, [], After),
    f2f_parens([Group | Done], Remaining);
% something else, we continue
f2f_parens(Done, [Other | After]) ->
    f2f_parens([Other | Done], After);
% done
f2f_parens(Done, []) ->
    lists:reverse(Done).
% slurp the inside of one pair of parens
%
% first argument is `{parens, Open, none}`: the open
% paren is known, the close paren is still missing.
% Inner collects what is between the parens, newest
% first. returns {slurp, Stem, Rest} where Stem has meta
% `{parens, Open, Close}` and Rest is what follows the
% close paren.

% ran out of tokens before close paren
slurp_pstem({parens, Open, none}, Inner, []) ->
    error({no_close_for, Open, Inner});
% hit close paren, we done
slurp_pstem({parens, Open, none}, Inner, [#tk{str = ")"} = Close | After]) ->
    Stem = #ns{meta = {parens, Open, Close},
               kids = lists:reverse(Inner)},
    {slurp, Stem, After};
% hit open paren, we recurse
slurp_pstem(Meta, Inner, [#tk{str = "("} = NestedOpen | After]) ->
    {slurp, Nested, Remaining} =
        slurp_pstem({parens, NestedOpen, none}, [], After),
    slurp_pstem(Meta, [Nested | Inner], Remaining);
% hit something else, we move along
slurp_pstem(Meta, Inner, [Other | After]) ->
    slurp_pstem(Meta, [Other | Inner], After).
@@ -1,5 +1,5 @@
% gsc tokenizer tests
-module(gs_test_tokens).
-module(gsc_test_tokens).
-export([
main/0, ct_dir/0
@@ -116,11 +116,10 @@ tokstr_concat_test_() ->
concat_property(FileName, FilePath) ->
%?debugFmt("concat_property(~p, _)", [FileName]),
{ok, FileBytes} = file:read_file(FilePath),
FileChars = unicode:characters_to_nfc_list(FileBytes),
FileChars = gsc:very_stable_file(FilePath),
{FileName ++ ": file = sum(tokens)",
fun() ->
case gsc_tokenizer:tokens(FileChars) of
case gsc:tokens_from_file(FileChars) of
{ok, SfcTokens} ->
ConcatStr = concat_token_strs(SfcTokens, []),
?assertEqual(FileChars, ConcatStr);
@@ -139,13 +138,15 @@ div_test_() ->
% divergence
DivFiles = div_files(),
%?debugFmt("DivFiles=~p", [DivFiles]),
{"claude tokenizer divergences fixed", [tokens_match(N, P) || {N, P} <- DivFiles]}.
{"claude tokenizer divergences fixed",
[tokens_match(N, P) || {N, P} <- DivFiles]}.
tokens_match(FileName, FilePath) ->
%?debugFmt("tokens_match(~p, _)", [FileName]),
% extracting data to be tested
SoTokens = gsc_cli:so_tokens(FilePath),
SfTokens = gsc_cli:gso_tokens(FilePath),
% i hate this so much but lazy and this is test code so who really cares.
SoTokens = so_tokens_from_file(FilePath),
SfTokens = gsc:gso_tokens_from_file(FilePath),
{FileName ++ ": tokenizers_agree",
fun() ->
case {SoTokens, SfTokens} of
@@ -155,3 +156,10 @@ tokens_match(FileName, FilePath) ->
{{error, _}, {ok, _}} -> error("so_scan failed and gso_scan succeded")
end
end}.
% that's right, we have to enter via converting the
% bytes in the file to a list... lol
% read file F and run so_scan:scan/1 over its raw bytes,
% converted to a list. crashes with badmatch if the
% file cannot be read.
so_tokens_from_file(F) ->
    {ok, Contents} = file:read_file(F),
    so_scan:scan(binary_to_list(Contents)).
+27
View File
@@ -0,0 +1,27 @@
% testing utilities
-module(ts_utils).
-export([
ct_dir/0,
ct_file/1, ct_file_abspath/1
]).
% directory containing the tests for the tokenizer:
% the "ct" directory under zx_daemon:get_home/0
-spec ct_dir() -> string().
ct_dir() ->
    Home = zx_daemon:get_home(),
    Home ++ "/ct".
% same as ct_file/1, under a more telling name
ct_file_abspath(FileName) ->
    ct_file(FileName).
% @doc
% path of a file inside the ct directory
%
% ct_file("foo.aes") -> "/path/to/ct/foo.aes"
-spec ct_file(Name) -> AbsPath when
    Name    :: string(),
    AbsPath :: string().

ct_file(Name) ->
    Dir = ct_dir(),
    Dir ++ "/" ++ Name.
+32
View File
@@ -143,3 +143,35 @@
| #gsc_err_nyi{}
| #gsc_err_empty_file{}
| #gsc_err{}.
%----------------------------
% tree type for parsing
%----------------------------
% @doc stem record
%
% a stem is an inner node of an ntree: `meta' is the
% value attached to the node, `kids' are its children
% in order. a child is either another stem or a leaf
% (see ntree/2 below).
-record(ns, {meta :: any(),
kids :: list(any())}).
% @doc `ntree(S, L)' is a "node tree" (meaning stems
% have values and children)
%
% for the purposes of the compiler, the key observation
% is that a flat list of tokens is already a forest
-type ntree(S, L) :: #ns{meta :: S, kids :: [ntree(S, L)]}
| L.
% @doc forest is just a list of trees
-type nforest(S, L) :: [ntree(S, L)].
% aliases
-type nt(S, L) :: ntree(S, L).
-type nf(S, L) :: nforest(S, L).
-type ntree() :: ntree(any(), any()).
-type nforest() :: [ntree()].
-type nt() :: ntree().
-type nf() :: nforest().
+9 -9
View File
@@ -196,13 +196,13 @@
%gulp_file([]) ->
% {error, empty_file};
%gulp_file(Tokens) ->
% case gs_tokens:take_block(Tokens) of
% case gsc_tokens:take_block(Tokens) of
% {Tokens, []} ->
% gulp_block(fun gulp_top_decl/1, Tokens);
% %gulp_file2([], [], Tokens);
% {A, B} ->
% StartPos = gs_tokens:start_pos(A),
% ErrPos = gs_tokens:start_pos(B),
% StartPos = gsc_tokens:start_pos(A),
% ErrPos = gsc_tokens:start_pos(B),
% Msg = efmt("gulp_file: block starting at ~p ends at ~p instead of EOF",
% [StartPos, ErrPos]),
% {error, #parse_error{pos = ErrPos, msg = Msg}}
@@ -212,7 +212,7 @@
%
%%gulp_file2(AccOks, AccErrs, Tokens = [_ | _]) ->
%% % ItemTokens will be nonempty
%% {ItemTokens, NewTokens} = gs_tokens:take_block_item(Tokens),
%% {ItemTokens, NewTokens} = gsc_tokens:take_block_item(Tokens),
%% case gulp_top_decl(ItemTokens) of
%% {gulp, Ok} -> gulp_file2([Ok | AccOks], AccErrs, NewTokens);
%% Err -> gulp_file2(AccOks, [Err | AccErrs], NewTokens)
@@ -258,7 +258,7 @@
%
%gulp_block(GulpItem, AccOks, AccErrs, Tokens = [_ | _]) ->
% % ItemTokens will be nonempty
% {ItemTokens, NewTokens} = gs_tokens:take_block_item(Tokens),
% {ItemTokens, NewTokens} = gsc_tokens:take_block_item(Tokens),
% case GulpItem(ItemTokens) of
% {gulp, Ok} -> gulp_block(GulpItem, [Ok | AccOks], AccErrs, NewTokens);
% Err -> gulp_block(GulpItem, AccOks, [Err | AccErrs], NewTokens)
@@ -284,7 +284,7 @@
%% | Using
%% @end
%gulp_top_decl(DeclTokens) ->
% case gs_tokens:strings(3, DeclTokens) of
% case gsc_tokens:strings(3, DeclTokens) of
% ["payable", "contract", "interface"] ->
% gulp_nyi(DeclTokens);
% ["contract", "interface" | _] ->
@@ -410,7 +410,7 @@
%% | (EModifier* 'entrypoint' | FModifier* 'function') Block(FunDecl)
%% | Using
%gulp_decl(Tokens) ->
% case gs_tokens:strings(1, Tokens) of
% case gsc_tokens:strings(1, Tokens) of
% ["type"] -> gulp_type_alias(Tokens);
% _ -> gulp_nyi(Tokens)
% end.
@@ -611,7 +611,7 @@
%% Type1 = {plist, Types} () (foo) (foo, bar)
%% | {token, #tk{}} foo Bar.baz 'quux
%slurp_type1(Tks) ->
% case gs_tokens:slurp_plist(Tks) of
% case gsc_tokens:slurp_plist(Tks) of
% % head token is NOT open paren -> must be id/qid/tvar
% {slurp, [], [Tk | NewTks]} ->
% TkType = Tk#tk.type,
@@ -633,7 +633,7 @@
%
%
%%slurp_type_expr_plist(Tks) ->
%% case gs_tokens:slurp_plist(Tks) of
%% case gsc_tokens:slurp_plist(Tks) of
%% % head token is NOT open paren -> must be id/qid/tvar
%% {slurp, [], [Tk | NewTks]} ->
%% TkType = Tk#tk.type,
+3 -3
View File
@@ -43,7 +43,7 @@
%
%% @doc for testing
%unsafe_vtks_from_string(S) ->
% {ok, SigTks} = gs_tokens:significant_tokens(S),
% {ok, SigTks} = gsc_tokens:significant_tokens(S),
% {gulp, Vtks} = gulp_vtks(SigTks),
% Vtks.
%
@@ -110,7 +110,7 @@
% end.
%
%slurp_plist_rec(Tokens = [#tk{string = "(" | _]) ->
% case gs_tokens:slurp_plist(Tokens) of
% case gsc_tokens:slurp_plist(Tokens) of
% {slurp, [], _} ->
% barf;
% {slurp, PTokens, NewTokens} ->
@@ -156,7 +156,7 @@
% {_Pfx = Tks1_BeforeOpen,
% _Sfx = Tks2_OpenNAfter
% = [#tk{string = "("} | _]} ->
% case gs_tokens:slurp_plist(Tks2_OpenNAfter) of
% case gsc_tokens:slurp_plist(Tks2_OpenNAfter) of
% {slurp, Tks2A_OpenToClose, Tks2B_AfterClose} ->
% NewAcc = [Acc,
% Tks1_BeforeOpen,
+1 -1
View File
@@ -63,7 +63,7 @@
%-spec end_pos([gsc_token()]) -> {value, tk_pos()} | none.
%
%end_pos([#gsc_token{pos = Pos, string = Str}]) ->
% {value, gs_tokens:new_pos(Pos, Str)};
% {value, gsc_tokens:new_pos(Pos, Str)};
%end_pos([_ | T]) ->
% end_pos(T);
%end_pos([]) ->
+52
View File
@@ -0,0 +1,52 @@
% @doc
% <pre>
% T R O N A L D D U M P
%
% .-""""""""""""-.
% .-' _..------.._ '-.
% .' .' GOLDEN NFC '. '.
% / / COMB-OVER MAP \ \
% ; ; .-^^^^^^^^^^-. ; ;
% | | / THEY'RE \ | |
% | | | NOT SENDING | | |
% | | | ASCII | | |
% ; ; \_.--. .--._./ ; ;
% \ \ (o)(o) / /
% '. '. __ .' .'
% '-._ '._==_.' _.-'
% '-._____.-'
% /|||\
% / ||| \
% / ||| \
% .-------' ||| '-------.
% / THE BEST NORMALIZER \
% / VERY STABLE CODEPOINTS \
% /_________________________________\
% </pre>
%
% When unicode sends its codepoints, they're not
% sending their best. They're not sending ASCII.
% They're not sending ASCII. They're sending integers
% that have lots of problems, and they're bringing
% those problems with us. They're bringing diacritics.
% They're bringing non-idempotent lowercasing. They're
% bringing graphemes that don't correspond bijectively
% with printable characters. They're bringing RTL.
% They're bringing invisible characters. They're
% bringing characters that draw outside the character
% boundary. They're bringing variable-width
% whitespace. They're bringing control characters.
% They're bringing emojis.
%
% And some, I assume, are good characters.
%
% `SrcStr' is a unicode NFC list, not an ordinary
% string. you think a string is a list of codepoints.
%
% NOOOOO.
%
% See it's different, because that's why.
%
% This is the cost of diversity, folks.
% @end
+137 -34
View File
@@ -1,45 +1,48 @@
% @doc bikeshed proctrastination head into vim warmup thing
% @doc bikeshed procrastination head into vim warmup
% thing
%
% sophia compiler from scratch by PRH
%
% based on original sophia compiler
%
% parse layers:
% 1. gs_tokens: SrcStr -> (Tokens | SigTokens)
%
% SigTokens = not comment/whitespace
%
% layers:
% a. gs_strmatch : matches string shapes
% b. gso_scan : converts to so_scan shapes
%
%
% terminology:
%
% - `slurp`/`barf` borrowed from emacs paredit mode:
%
% slurp : (a b) c -> (a b c)
% barf : (a b c) -> a (b c)
%
% * `slurp` usually involves *transforming* input
% into a new type (e.g. slurp a token from src
% string); think of slurp as a verb meaning to
% consume and then digest
% * `barf` basically means blindly splitting off
% input
%
% based on original sophia compiler; target for version
% 0.1 is to match behavior exactly
% @end
-module(gsc).
% token and tokens
-export_type([
token/0
token/0,
signal/0
]).
% syntax tree/forest wrapper type
-export_type([
ntree/2, ntree/0,
nforest/2, nforest/0,
nt/2, nt/0,
nf/2, nf/0
]).
-export([
unsafe_tokens_from_file/1,
unsafe_tokens_from_string/1,
unsafe_signal_from_file/1,
unsafe_signal_from_string/1,
filter_signal/1,
signal_from_string/1,
signal_from_file/1,
sigtokens_from_file/1,
sigtokens_from_string/1,
tokens_from_file/1,
tokens_from_string/1
tokens_from_string/1,
% sophia compatibility
gso_tokens_from_file/1,
gso_tokens_from_string/1,
% unicode normalization
very_stable_codepoints/1,
very_stable_string/1,
very_stable_file/1
]).
-include("$gsc_include/gsc.hrl").
@@ -50,19 +53,52 @@
-type token() :: tk().
% @doc signal means non-noise (whitespace/comment)
% tokens; legacy name still around is "sigtokens"
-type signal() :: [tk()].
%-----------------------------------------
% functions
% API: FUNCTIONS
%-----------------------------------------
%-----------------------------------------
% aint nobody got time for case shit
%-----------------------------------------
% tokens
%% @doc Like `tokens_from_file/1', but returns the bare token list.
%% Crashes with `badmatch' on anything other than `{ok, Tokens}'.
unsafe_tokens_from_file(FilePath) ->
    {ok, Tokens} = tokens_from_file(FilePath),
    Tokens.
%% @doc Like `tokens_from_string/1', but returns the bare token list.
%% Crashes with `badmatch' on anything other than `{ok, Tokens}'.
unsafe_tokens_from_string(SrcStr) ->
    {ok, Tokens} = tokens_from_string(SrcStr),
    Tokens.
% signal
%% @doc Like `signal_from_file/1', but returns the bare signal
%% (token list). Crashes with `badmatch' on anything other than
%% `{ok, Signal}'.
unsafe_signal_from_file(FilePath) ->
    {ok, Signal} = signal_from_file(FilePath),
    Signal.
%% @doc Like `signal_from_string/1', but returns the bare signal
%% (token list). Crashes with `badmatch' on anything other than
%% `{ok, Signal}'.
unsafe_signal_from_string(SrcStr) ->
    {ok, Signal} = signal_from_string(SrcStr),
    Signal.
%
%% @doc Delegates to `gsc_tokens:filter_significant/1'.
filter_signal(Tokens) ->
    gsc_tokens:filter_significant(Tokens).
%% @doc Current name for `sigtokens_from_file/1'; simply forwards to it.
signal_from_file(FilePath) ->
    sigtokens_from_file(FilePath).
%% @doc Current name for `sigtokens_from_string/1'; simply forwards to it.
signal_from_string(SrcStr) ->
    sigtokens_from_string(SrcStr).
% @doc legacy name for signal
sigtokens_from_file(X) ->
case tokens_from_file(X) of
{ok, Y} -> {ok, gs_tokens:filter_significant(Y)};
{ok, Y} -> {ok, gsc_tokens:filter_significant(Y)};
Err -> Err
end.
sigtokens_from_string(X) ->
case tokens_from_string(X) of
{ok, Y} -> {ok, gs_tokens:filter_significant(Y)};
{ok, Y} -> {ok, gsc_tokens:filter_significant(Y)};
Err -> Err
end.
@@ -81,7 +117,6 @@ tokens_from_file(FilePath) ->
-spec tokens_from_string(SrcStr) -> Result
when SrcStr :: string(),
Result :: {ok, Tokens}
@@ -89,4 +124,72 @@ tokens_from_file(FilePath) ->
Tokens :: [tk()].
tokens_from_string(SrcStr) ->
gs_tokens:tokens(SrcStr).
gsc_tokens:tokens(SrcStr).
-spec gso_tokens_from_file(FilePath) -> Result when
    FilePath :: string(),
    Result   :: {ok, GsoTks} | {error, Reason},
    GsoTks   :: [gso_scan:so_token()],
    Reason   :: gsc_err() | any().
%% @doc Read `FilePath' and hand its raw contents to
%% `gso_tokens_from_string/1'. Any non-`{ok, _}' result of
%% `file:read_file/1' is returned to the caller unchanged.

gso_tokens_from_file(FilePath) ->
    ReadResult = file:read_file(FilePath),
    case ReadResult of
        {ok, Contents} -> gso_tokens_from_string(Contents);
        ReadFailure    -> ReadFailure
    end.
-spec gso_tokens_from_string(Str) -> Result when
    Str    :: iolist(),
    Result :: {ok, GsoTks} | {error, Reason},
    GsoTks :: [gso_scan:so_token()],
    Reason :: gsc_err() | any().
%% @doc Normalize `Str' to an NFC codepoint list and scan it with
%% `gso_scan:scan/1'.
%%
%% Input that cannot be decoded as unicode is reported as
%% `{error, {invalid_unicode, Decoded, Rest}}', where `Decoded' is the
%% part that normalized cleanly and `Rest' is the undecodable
%% remainder.

gso_tokens_from_string(Evil) ->
    case gsc_tokens:very_stable_codepoints(Evil) of
        % unicode:characters_to_nfc_list/1 signals bad input with an
        % error tuple rather than an exception; surface it through the
        % {error, Reason} contract instead of feeding the tuple to the
        % scanner
        {error, Decoded, Rest} ->
            {error, {invalid_unicode, Decoded, Rest}};
        Str when is_list(Str) ->
            gso_scan:scan(Str)
    end.
-spec very_stable_codepoints(String) -> Normalized when
    String     :: iolist(),
    Normalized :: string().
%% @doc Normalize character data to an NFC list of codepoints.
%% Delegates to `gsc_tokens:very_stable_codepoints/1'.

very_stable_codepoints(Chars) ->
    gsc_tokens:very_stable_codepoints(Chars).
-spec very_stable_string(String) -> Normalized when
    String     :: iolist(),
    Normalized :: string().
%% @doc Same operation as `very_stable_codepoints/1' under another
%% name; forwards to `gsc_tokens:very_stable_codepoints/1'.

very_stable_string(Chars) ->
    gsc_tokens:very_stable_codepoints(Chars).
-spec very_stable_file(FilePath) -> Contents when
    FilePath :: string(),
    Contents :: string().
%% @doc Read `FilePath' and return its contents normalized by
%% `very_stable_codepoints/1'.
%%
%% Despite the name this is NOT a total function: when
%% `file:read_file/1' does not return `{ok, _}' (e.g. file not
%% found), the whole result is raised via `erlang:error/1'.
%%
%% Intended mostly as a scripting/shell convenience.

very_stable_file(FilePath) ->
    case file:read_file(FilePath) of
        {ok, Bytes} ->
            very_stable_codepoints(Bytes);
        ReadFailure ->
            % raise instead of returning: callers get contents or a crash
            error(ReadFailure)
    end.
+55
View File
@@ -0,0 +1,55 @@
-module(gsc_ntree).
-export([
nstem/2, meta/1, kids/1,
flatten_tree/1, flatten_forest/1
]).
-include("$gsc_include/gsc.hrl").
%%=====================================================
%% API: functions
%%=====================================================
-spec nstem(Root, Forest) -> Tree when
    Root   :: S,
    Forest :: nforest(S, L),
    Tree   :: ntree(S, L),
    S      :: any(),
    L      :: any().
% @doc Build a stem (inner node) whose metadata is `Root' and whose
% children are `List'.
%
% Constructed with record syntax, like `meta/1' and `kids/1', so the
% node layout always follows the `#ns{}' definition in gsc.hrl rather
% than a hand-written tuple.

nstem(Root, List) ->
    #ns{meta = Root, kids = List}.
% @doc Return the `meta' field of an `#ns{}' node.
meta(Node = #ns{}) ->
    Node#ns.meta.
% @doc Return the `kids' field of an `#ns{}' node.
kids(Node = #ns{}) ->
    Node#ns.kids.
-spec flatten_tree(Tree) -> Leafs when
    Tree  :: ntree(_, L),
    Leafs :: [L],
    L     :: any().
% @doc Collect the leaves of `Tree' into one flat list: `ft/1' builds
% a nested list mirroring the tree, which is then flattened with
% `lists:flatten/1'.

flatten_tree(Tree) ->
    NestedLeafs = ft(Tree),
    lists:flatten(NestedLeafs).
-spec flatten_forest(Forest) -> Leafs when
    Forest :: nforest(_, L),
    Leafs  :: [L],
    L      :: any().
% @doc Collect the leaves of every tree in `Forest' into one flat
% list: `ff/1' builds a nested list, which is then flattened with
% `lists:flatten/1'.

flatten_forest(Forest) ->
    NestedLeafs = ff(Forest),
    lists:flatten(NestedLeafs).
% @private
% Nested-list form of one tree: an `#ns{}' node expands to the
% nested form of its kids; anything else is a leaf and is wrapped in
% a singleton list.
ft(Node) when is_record(Node, ns) ->
    ff(Node#ns.kids);
ft(Leaf) ->
    [Leaf].
% @private
% Nested-list form of a forest: applies `ft/1' to each tree, in order.
ff(Forest) ->
    [ft(Tree) || Tree <- Forest].
+111
View File
@@ -0,0 +1,111 @@
% signal = non-noisy tokens
-module(gsc_signal).
-export([
from_tokens/1,
is_block/1,
gulp_block_items/1,
block_to_items/1,
take_block_item/1
]).
-include("$gsc_include/gsc.hrl").
-spec from_tokens(Tokens) -> Signal when
    Tokens :: [tk()],
    Signal :: [tk()].
% @doc Reduce a token list to its signal (drop comments/whitespace)
% by delegating to `gsc_tokens:filter_significant/1'.

from_tokens(AllTokens) ->
    gsc_tokens:filter_significant(AllTokens).
-spec is_block(Signal) -> Result when
    Signal :: [tk()],
    Result :: boolean().
% @doc True when no token after the first sits in a column to the
% left of the first token's column. The empty signal counts as a
% block.

is_block([]) ->
    true;
is_block([#tk{pos = {_, BlockCol}} | Others]) ->
    lists:all(fun(#tk{pos = {_, Col}}) -> Col >= BlockCol end, Others).
-spec gulp_block_items(Signal) -> Result when
    Signal :: [tk()],
    Result :: {gulp, Items}
            | {error, any()},
    Items  :: [Signal].
% @doc Split a signal that forms a block into its block items.
%
% Returns `{gulp, Items}' when `is_block/1' accepts the whole signal
% (so there is no leftover signal to hand back). Otherwise returns the
% `{error, {bad_block, Details}}' produced by `find_badness/1', which
% identifies the first token that falls left of the block's column.

gulp_block_items(S) ->
    case is_block(S) of
        true  -> {gulp, block_to_items(S)};
        false -> find_badness(S)
    end.
% @private
% Entry point: the first token fixes the block column; the rest of
% the signal is searched by find_badness/3.
find_badness([First = #tk{pos = {_, BlockCol}} | Others]) ->
    find_badness(BlockCol, First, Others).
% @private
% Walk the signal until a token starts left of StartCol and report it
% together with the block's first token. There is deliberately no
% clause for the empty list: running out of tokens crashes with
% function_clause.
find_badness(StartCol, StartTk, [#tk{pos = {_, TkCol}} = Tk | _])
  when TkCol < StartCol ->
    {error, {bad_block, [{start_col, StartCol},
                         {end_col,   TkCol},
                         {start_tk,  StartTk},
                         {end_tk,    Tk}]}};
find_badness(StartCol, StartTk, [#tk{pos = {_, _}} | Rest]) ->
    find_badness(StartCol, StartTk, Rest).
-spec block_to_items(Signal) -> BlockItems when
    Signal     :: [tk()],
    BlockItems :: [Signal].
% @doc
% Cut a signal into consecutive block items, in source order.
%
% The algorithm is naive: it does not check that all items start at
% the same indent level.
%
% Input:
%     foo = ...
%     bar = ...
%     baz = ...
%
% Output:
%     [foo = ...,
%      bar = ...,
%      baz = ...]

block_to_items(Signal) ->
    % the empty signal needs no clause of its own: b2is/2 returns []
    b2is([], Signal).
% @private
% Accumulator loop behind block_to_items/1: peel one item at a time
% off the front of the signal, collecting items in reverse.
b2is(ItemsRev, []) ->
    lists:reverse(ItemsRev);
b2is(ItemsRev, Signal) ->
    {NextItem, Remaining} = take_block_item(Signal),
    b2is([NextItem | ItemsRev], Remaining).
-spec take_block_item(Signal) -> Result when
    Signal    :: [tk()],
    Result    :: {Item, NewSignal},
    Item      :: Signal,
    NewSignal :: Signal.
% @doc Split the first block item off the front of the signal.
%
% The item is the head token plus the longest run of following tokens
% whose column is strictly greater than the head token's column.
% Returns `{[], []}' for the empty signal.

take_block_item([]) ->
    {[], []};
take_block_item([Head = #tk{pos = {_, ItemCol}} | Tail]) ->
    DeeperThanHead = fun(#tk{pos = {_, Col}}) -> Col > ItemCol end,
    {Continuation, Remaining} = lists:splitwith(DeeperThanHead, Tail),
    {[Head | Continuation], Remaining}.
+1 -1
View File
@@ -70,7 +70,7 @@
% `contract` gets tokenized as a keyword and not a variable name), and then
% calls into this module in order to match the string shape it's looking for.
% @end
-module(gs_strmatch).
-module(gsc_strmatch).
%-compile([export_all, nowarn_export_all]).
+55 -27
View File
@@ -16,7 +16,7 @@
% 2. to future-proof in case we decide to incrementally incorporate the gsc
% code into the legacy sophia compiler
% @end
-module(gs_tokens).
-module(gsc_tokens).
% meta
-export([
@@ -39,6 +39,9 @@
is_significant/1,
filter_significant/1,
significant_tokens/1,
very_stable_codepoints/1,
very_stable_string/1,
very_stable_characters/1,
tokens_from_iolist/1,
tokens/1,
slurp_token/2,
@@ -188,13 +191,13 @@ slurp_dlist(All, Opens, [#tk{str = "["} = Tk | NewTks]) ->
slurp_dlist(All, Opens, [#tk{str = "{"} = Tk | NewTks]) ->
slurp_dlist([Tk | All], [Tk | Opens], NewTks);
% sad: mismatch cases
slurp_dlist(All, Opens, []) ->
slurp_dlist(_, Opens, []) ->
{error, {fixme, mismatch, Opens, none}};
slurp_dlist(All, Opens, [#tk{str = "}"} = BadClose | _]) ->
slurp_dlist(_, Opens, [#tk{str = "}"} = BadClose | _]) ->
{error, {fixme, mismatch, Opens, {value, BadClose}}};
slurp_dlist(All, Opens, [#tk{str = "]"} = BadClose | _]) ->
slurp_dlist(_, Opens, [#tk{str = "]"} = BadClose | _]) ->
{error, {fixme, mismatch, Opens, {value, BadClose}}};
slurp_dlist(All, Opens, [#tk{str = ")"} = BadClose | _]) ->
slurp_dlist(_, Opens, [#tk{str = ")"} = BadClose | _]) ->
{error, {fixme, mismatch, Opens, {value, BadClose}}};
% general case: non-terminal token gets pushed
slurp_dlist(All, Opens, [Tk | NewTks]) ->
@@ -330,6 +333,29 @@ is_significant(#tk{shape = ws}) -> false;
is_significant(_) -> true.
% aliases
%% @doc Alias for `very_stable_codepoints/1'.
very_stable_string(Chars) ->
    very_stable_codepoints(Chars).
%% @doc Alias for `very_stable_codepoints/1'.
very_stable_characters(Chars) ->
    very_stable_codepoints(Chars).
-spec very_stable_codepoints(IoList) -> Result when
    IoList  :: iolist(),
    Result  :: NfcList | {error, NfcList, Rest},
    NfcList :: string(),
    Rest    :: unicode:chardata().
%% @doc Normalize character data to a list of unicode codepoints in
%% Normalization Form C.
%%
%% Thin wrapper around `unicode:characters_to_nfc_list/1'. Input that
%% cannot be decoded is NOT raised as an exception: the underlying
%% call returns `{error, NfcList, Rest}' and that tuple is passed
%% through to the caller as-is.

very_stable_codepoints(S) ->
    unicode:characters_to_nfc_list(S).
-spec tokens_from_iolist(SrcStr) -> Result when
SrcStr :: iolist(),
Result :: {ok, Tokens}
@@ -341,6 +367,7 @@ tokens_from_iolist(S) -> tokens(S).
-spec tokens(SrcStr) -> Result
when SrcStr :: iolist(),
Result :: {ok, Tokens}
@@ -355,7 +382,8 @@ tokens_from_iolist(S) -> tokens(S).
tokens(S) ->
% defensive normalization
tokens([], {1, 1}, unicode:characters_to_nfc_list(S)).
tokens([], {1, 1}, very_stable_codepoints(S)).
tokens(Stack, _FinalPos, "") ->
{ok, lists:reverse(Stack)};
@@ -559,8 +587,8 @@ slurp_token_of_shape(bcom, Pos, SrcStr0) ->
no_tokmatch
end;
slurp_token_of_shape(ws, Pos, SrcStr) ->
WhitespaceMatcher = gs_strmatch:smr_sf_ws(),
case gs_strmatch:match(WhitespaceMatcher, SrcStr) of
WhitespaceMatcher = gsc_strmatch:smr_sf_ws(),
case gsc_strmatch:match(WhitespaceMatcher, SrcStr) of
no_strmatch ->
no_tokmatch;
{strmatch, WS, Rest} ->
@@ -594,7 +622,7 @@ slurp_token_of_shape(kwd, Pos, SrcStr) ->
no_tokmatch
end;
slurp_token_of_shape(op, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_op(), SrcStr) of
case gsc_strmatch:match(gsc_strmatch:smr_sf_op(), SrcStr) of
{strmatch, Str, Rest} ->
Token = #tk{shape = op, pos = Pos, str = Str},
{tokmatch, Token, Rest};
@@ -602,7 +630,7 @@ slurp_token_of_shape(op, Pos, SrcStr) ->
no_tokmatch
end;
slurp_token_of_shape(punct, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_punct(), SrcStr) of
case gsc_strmatch:match(gsc_strmatch:smr_sf_punct(), SrcStr) of
{strmatch, Str, Rest} ->
Token = #tk{shape = punct, pos = Pos, str = Str},
{tokmatch, Token, Rest};
@@ -611,7 +639,7 @@ slurp_token_of_shape(punct, Pos, SrcStr) ->
end;
% SOPHIA VARIABLE NAMES: id, con, qid, qcon, tvar
slurp_token_of_shape(id, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_id(), SrcStr) of
case gsc_strmatch:match(gsc_strmatch:smr_sf_id(), SrcStr) of
{strmatch, IdStr, Rest} ->
Token = #tk{shape = id, pos = Pos, str = IdStr},
{tokmatch, Token, Rest};
@@ -619,7 +647,7 @@ slurp_token_of_shape(id, Pos, SrcStr) ->
no_tokmatch
end;
slurp_token_of_shape(con, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_con(), SrcStr) of
case gsc_strmatch:match(gsc_strmatch:smr_sf_con(), SrcStr) of
{strmatch, Str, Rest} ->
Token = #tk{shape = con, pos = Pos, str = Str},
{tokmatch, Token, Rest};
@@ -627,7 +655,7 @@ slurp_token_of_shape(con, Pos, SrcStr) ->
no_tokmatch
end;
slurp_token_of_shape(qid, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_qid(), SrcStr) of
case gsc_strmatch:match(gsc_strmatch:smr_sf_qid(), SrcStr) of
{strmatch, Str, Rest} ->
Token = #tk{shape = qid, pos = Pos, str = Str},
{tokmatch, Token, Rest};
@@ -635,7 +663,7 @@ slurp_token_of_shape(qid, Pos, SrcStr) ->
no_tokmatch
end;
slurp_token_of_shape(qcon, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_qcon(), SrcStr) of
case gsc_strmatch:match(gsc_strmatch:smr_sf_qcon(), SrcStr) of
{strmatch, Str, Rest} ->
Token = #tk{shape = qcon, pos = Pos, str = Str},
{tokmatch, Token, Rest};
@@ -643,7 +671,7 @@ slurp_token_of_shape(qcon, Pos, SrcStr) ->
no_tokmatch
end;
slurp_token_of_shape(tvar, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_tvar(), SrcStr) of
case gsc_strmatch:match(gsc_strmatch:smr_sf_tvar(), SrcStr) of
{strmatch, Str, Rest} ->
Token = #tk{shape = tvar, pos = Pos, str = Str},
{tokmatch, Token, Rest};
@@ -651,7 +679,7 @@ slurp_token_of_shape(tvar, Pos, SrcStr) ->
no_tokmatch
end;
slurp_token_of_shape(int16, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_int16(), SrcStr) of
case gsc_strmatch:match(gsc_strmatch:smr_sf_int16(), SrcStr) of
{strmatch, Str, Rest} ->
Token = #tk{shape = int16, pos = Pos, str = Str},
{tokmatch, Token, Rest};
@@ -659,7 +687,7 @@ slurp_token_of_shape(int16, Pos, SrcStr) ->
no_tokmatch
end;
slurp_token_of_shape(int10, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_int10(), SrcStr) of
case gsc_strmatch:match(gsc_strmatch:smr_sf_int10(), SrcStr) of
{strmatch, Str, Rest} ->
Token = #tk{shape = int10, pos = Pos, str = Str},
{tokmatch, Token, Rest};
@@ -671,8 +699,8 @@ slurp_token_of_shape(int10, Pos, SrcStr) ->
%
% char: sophia char literal
slurp_token_of_shape(ak, Pos, SrcStr) ->
StringMatcher = gs_strmatch:smr_sf_ak(),
case gs_strmatch:match(StringMatcher, SrcStr) of
StringMatcher = gsc_strmatch:smr_sf_ak(),
case gsc_strmatch:match(StringMatcher, SrcStr) of
no_strmatch ->
no_tokmatch;
{strmatch, TokenStr, Rest} ->
@@ -680,8 +708,8 @@ slurp_token_of_shape(ak, Pos, SrcStr) ->
{tokmatch, Token, Rest}
end;
slurp_token_of_shape(ct, Pos, SrcStr) ->
StringMatcher = gs_strmatch:smr_sf_ct(),
case gs_strmatch:match(StringMatcher, SrcStr) of
StringMatcher = gsc_strmatch:smr_sf_ct(),
case gsc_strmatch:match(StringMatcher, SrcStr) of
no_strmatch ->
no_tokmatch;
{strmatch, TokenStr, Rest} ->
@@ -689,8 +717,8 @@ slurp_token_of_shape(ct, Pos, SrcStr) ->
{tokmatch, Token, Rest}
end;
slurp_token_of_shape(sg, Pos, SrcStr) ->
StringMatcher = gs_strmatch:smr_sf_sg(),
case gs_strmatch:match(StringMatcher, SrcStr) of
StringMatcher = gsc_strmatch:smr_sf_sg(),
case gsc_strmatch:match(StringMatcher, SrcStr) of
no_strmatch ->
no_tokmatch;
{strmatch, TokenStr, Rest} ->
@@ -698,8 +726,8 @@ slurp_token_of_shape(sg, Pos, SrcStr) ->
{tokmatch, Token, Rest}
end;
slurp_token_of_shape(char, Pos, SrcStr) ->
StringMatcher = gs_strmatch:smr_sf_char(),
case gs_strmatch:match(StringMatcher, SrcStr) of
StringMatcher = gsc_strmatch:smr_sf_char(),
case gsc_strmatch:match(StringMatcher, SrcStr) of
no_strmatch ->
no_tokmatch;
{strmatch, TokenStr, Rest} ->
@@ -707,7 +735,7 @@ slurp_token_of_shape(char, Pos, SrcStr) ->
{tokmatch, Token, Rest}
end;
slurp_token_of_shape(string, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_str(), SrcStr) of
case gsc_strmatch:match(gsc_strmatch:smr_sf_str(), SrcStr) of
no_strmatch ->
no_tokmatch;
{strmatch, TokenStr, Rest} ->
@@ -715,7 +743,7 @@ slurp_token_of_shape(string, Pos, SrcStr) ->
{tokmatch, Token, Rest}
end;
slurp_token_of_shape(bytes, Pos, SrcStr) ->
case gs_strmatch:match(gs_strmatch:smr_sf_bytes(), SrcStr) of
case gsc_strmatch:match(gsc_strmatch:smr_sf_bytes(), SrcStr) of
no_strmatch ->
no_tokmatch;
{strmatch, TokenStr, Rest} ->
+2 -2
View File
@@ -1,6 +1,6 @@
% @doc compatibility layer to test against so_scan
%
% converts gs_tokens data to so_scan tokens
% converts gsc_tokens data to so_scan tokens
%
% Ref: so_scan.erl
-module(gso_scan).
@@ -104,7 +104,7 @@
% @end
scan(SrcStr) ->
case gs_tokens:tokens(SrcStr) of
case gsc_tokens:tokens(SrcStr) of
{ok, SfLTokens} ->
SoTokens = to_so_tokens(SfLTokens),
{ok, SoTokens};