% @doc
|
|
% A string matcher is roughly analogous to a regex. It describes a pattern,
|
|
% which a string may or may not match.
|
|
%
|
|
% This module is essentially a pure erlang implementation of the subset of
|
|
% regular expressions that are needed to tokenize sophia.
|
|
%
|
|
% The intent for now (May 2026) is simply to perfectly mimic the so_scan library
|
|
%
|
|
% Reference is `docs/sophia_syntax.md` as well as `src/so_scan_lib.erl` in
|
|
% original sophia lib
|
|
%
|
|
% From docs/sophia_syntax.md:
|
|
%
|
|
% - Id = [a-z_][A-Za-z0-9_']* identifiers start with a lower case letter.
|
|
% - Con = [A-Z][A-Za-z0-9_]* constructors start with an upper case letter.
|
|
% - QId = (Con\.)+Id qualified identifiers (e.g. `Map.member`)
|
|
% - QCon = (Con\.)+Con qualified constructor
|
|
% - TVar = 'Id type variable (e.g `'a`, `'b`)
|
|
% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* integer literal with optional `_` separators
|
|
% - Bytes = #[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* byte array literal with optional `_` separators
|
|
% - String string literal enclosed in " with escape character `\`
|
|
% - Char character literal enclosed in ' with escape character `\`
|
|
% - AccountAddress base58-encoded 32 byte account pubkey with `ak_` prefix
|
|
% - ContractAddress base58-encoded 32 byte contract address with `ct_` prefix
|
|
% - Signature base58-encoded 64 byte cryptographic signature with `sg_` prefix
|
|
%
|
|
% Sophia's notion of tokens also includes keywords, parens, whitespace, etc.
|
|
% Real reference is of course the code:
|
|
%
|
|
% Number = fun(Digit) -> [Digit, "+(_", Digit, "+)*"] end,
|
|
% DIGIT = "[0-9]",
|
|
% HEXDIGIT = "[0-9a-fA-F]",
|
|
% LOWER = "[a-z_]",
|
|
% UPPER = "[A-Z]",
|
|
% CON = [UPPER, "[a-zA-Z0-9_]*"],
|
|
% INT = Number(DIGIT),
|
|
% HEX = ["0x", Number(HEXDIGIT)],
|
|
% BYTES = ["#", Number(HEXDIGIT)],
|
|
% WS = "[\\000-\\ ]+",
|
|
% ID = [LOWER, "[a-zA-Z0-9_']*"],
|
|
% TVAR = ["'", ID],
|
|
% QID = ["(", CON, "\\.)+", ID],
|
|
% QCON = ["(", CON, "\\.)+", CON],
|
|
% OP = "[=!<>+\\-*/:&|?~@^]+",
|
|
% %% Five cases for a character
|
|
% %% * 1 7-bit ascii, not \ or '
|
|
% %% * 2-4 8-bit values (UTF8)
|
|
% %% * \ followed by a known modifier [aernrtv]
|
|
% %% * \xhh
|
|
% %% * \x{hhh...}
|
|
% CHAR = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
|
|
% STRING = "\"([^\"\\\\]|(\\\\.))*\"",
|
|
%
|
|
% CommentStart = {"/\\*", push(comment, skip())},
|
|
% CommentRules =
|
|
% [ CommentStart
|
|
% , {"\\*/", pop(skip())}
|
|
% , {"[^/*]+|[/*]", skip()} ],
|
|
%
|
|
% Keywords = ["contract", "include", "let", "switch", "type", "record", "datatype", "if", "elif", "else", "function",
|
|
% "stateful", "payable", "true", "false", "mod", "public", "entrypoint", "private", "indexed", "namespace",
|
|
% "interface", "main", "using", "as", "for", "hiding", "band", "bor", "bxor", "bnot"
|
|
% ],
|
|
% KW = string:join(Keywords, "|"),
|
|
%
|
|
% There is a lot going on in that code. This module is purely the part that
% matches string shapes. The *tokenizer* (gsc_tokenizer) knows the hierarchy
% of sophia tokens (e.g. it knows to match keywords before identifiers, so that
% `contract` gets tokenized as a keyword and not a variable name), and then
% calls into this module in order to match the string shape it's looking for.
|
|
% @end
|
|
-module(gs_strmatch).
|
|
|
|
%-compile([export_all, nowarn_export_all]).
|
|
|
|
|
|
-export_type([
|
|
string_matcher/0
|
|
]).
|
|
|
|
% given a string matcher and a string, determine match or no
|
|
-export([
|
|
match/2
|
|
]).
|
|
|
|
% string matchers for sophia token shapes
|
|
-export([
|
|
smr_sf_ws/0,
|
|
smr_sf_op/0,
|
|
smr_sf_punct/0,
|
|
smr_sf_id/0,
|
|
smr_sf_con/0,
|
|
smr_sf_qid/0,
|
|
smr_sf_qcon/0,
|
|
smr_sf_tvar/0,
|
|
smr_sf_int16/0,
|
|
smr_sf_int10/0,
|
|
smr_sf_bytes/0,
|
|
smr_sf_str/0,
|
|
smr_sf_char/0,
|
|
smr_sf_ak/0,
|
|
smr_sf_ct/0,
|
|
smr_sf_sg/0
|
|
]).
|
|
|
|
|
|
% regex primitives/combinators
|
|
-export([
|
|
% plumbing
|
|
smr_char/1,
|
|
smr_char_range/2,
|
|
smr_union/1,
|
|
smr_seq/1,
|
|
smr_plus/1,
|
|
smr_star/1,
|
|
smr_dot/0,
|
|
smr_ncmatch/2,
|
|
% porcelain
|
|
smr_string/1,
|
|
smr_oneofchars/1
|
|
]).
|
|
|
|
|
|
|
|
%%=======================================================================
|
|
%% API: Types
|
|
%%=======================================================================
|
|
|
|
% A string matcher is a small regex-like term. Each alternative is shown with
% the regex it is analogous to.
%
% smr_ncmatch is a "negative conditional match": the input must NOT match
% MustNotMatch, and must match Match. It generalises negated char classes:
%
%   /[^a-z]/ <~> smr_ncmatch(smr_char_range($a, $z), smr_dot()).
-type string_matcher()
    :: {smr_char, integer()}                    % /a/
     | {smr_char_range, integer(), integer()}   % /[a-z]/
     | {smr_union, [string_matcher()]}          % /[abc]/, /(foo|bar)/
     | {smr_seq, [string_matcher()]}            % /abc/
     | {smr_plus, string_matcher()}             % /(abc)+/
     | {smr_star, string_matcher()}             % /(abc)*/
     | smr_dot                                  % /./
     | {smr_ncmatch, MustNotMatch :: string_matcher(),
                     Match        :: string_matcher()}.
|
|
|
|
|
|
%=========================================================
|
|
% API: Functions
|
|
%=========================================================
|
|
|
|
%---------------------------------------------------------
|
|
% API: string matching logic
|
|
%
|
|
% -export([
|
|
% match/2
|
|
% ]).
|
|
%---------------------------------------------------------
|
|
|
|
-spec match(Matcher, Source) -> MaybeMatch
    when Matcher    :: string_matcher(),
         Source     :: unicode:chardata(),
         MaybeMatch :: {strmatch, Matched :: string(), Rest :: string()}
                     | no_strmatch.
% @doc
% Try to match a prefix of Source against Matcher.
%
% Source is first normalized to a flat NFC list of code points, so Matched and
% Rest are both in NFC form and are not guaranteed to be codepoint-identical
% to the caller's input.
%
% Returns {strmatch, Matched, Rest} when a prefix matches (Matched is the
% consumed prefix, Rest is everything after it), and no_strmatch otherwise.
%
% Raises error:badarg when Source is not valid unicode chardata. In that case
% unicode:characters_to_nfc_list/1 returns an {error, _, _} tuple rather than
% a list; handing that tuple to the matching engine would either crash inside
% it or leak the tuple back to the caller as Rest.
% @end
match(Matcher, Source) ->
    case unicode:characters_to_nfc_list(Source) of
        Chars when is_list(Chars) -> string_match(Matcher, Chars);
        {error, _, _}             -> erlang:error(badarg, [Matcher, Source])
    end.
|
|
|
|
|
|
%---------------------------------------------------------
|
|
% API: string matchers for sophia tokens
|
|
%
|
|
% -export([
|
|
% smr_sf_ws/0,
|
|
% smr_sf_op/0,
|
|
% smr_sf_punct/0,
|
|
% smr_sf_id/0,
|
|
% smr_sf_con/0,
|
|
% smr_sf_qid/0,
|
|
% smr_sf_qcon/0,
|
|
% smr_sf_tvar/0,
|
|
% smr_sf_int16/0,
|
|
% smr_sf_int10/0,
|
|
% smr_sf_bytes/0,
|
|
% smr_sf_str/0,
|
|
% smr_sf_char/0,
|
|
% smr_sf_ak/0,
|
|
% smr_sf_ct/0,
|
|
% smr_sf_sg/0
|
|
% ]).
|
|
%---------------------------------------------------------
|
|
|
|
-spec smr_sf_ws() -> string_matcher().
% @doc
% String matcher for a run of sophia whitespace.
%
% Mirrors so_scan.erl (9.0.0):
%
%     WS = "[\\000-\\ ]+",
%
% i.e. one or more code points in the range 0..32: the space character and
% every ASCII code point below it.
% @end
smr_sf_ws() ->
    OneWhitespaceChar = smr_oneofchars(lists:seq(0, $\s)),
    smr_plus(OneWhitespaceChar).
|
|
|
|
|
|
|
|
-spec smr_sf_op() -> string_matcher().
% @doc
% String matcher for a sophia operator: one or more operator characters.
%
% Mirrors so_scan.erl (9.0.0):
%
%     OP = "[=!<>+\\-*/:&|?~@^]+",
% @end
smr_sf_op() ->
    smr_plus(smr_oneofchars("=!<>+-*/:&|?~@^")).
|
|
|
|
|
|
|
|
-spec smr_sf_punct() -> string_matcher().
% @doc
% String matcher for sophia punctuation: the ".." symbol, or a single
% comma, dot, semicolon, paren, bracket or brace.
%
% Mirrors so_scan.erl (9.0.0):
%
%     , {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
% @end
smr_sf_punct() ->
    % ".." is listed before the single-character alternative on purpose: a
    % union commits to the first alternative that matches, so the other order
    % would split ".." into two "." matches.
    Alternatives = [smr_string(".."), smr_oneofchars(",.;()[]{}")],
    smr_union(Alternatives).
|
|
|
|
|
|
|
|
-spec smr_sf_id() -> string_matcher().
% @doc
% String matcher for a sophia identifier, e.g.
%
%     foo
%     _foo
%     fooBar'
%
% - Id = [a-z_][A-Za-z0-9_']*   identifiers start with a lower case letter
%                               (or an underscore).
% @end
smr_sf_id() ->
    FirstChar = smr_union([smr_char_range($a, $z), smr_char($_)]),
    % upper | lower | digit | underscore | single quote
    TailChar  = smr_union([smr_char_range($A, $Z),
                           smr_char_range($a, $z),
                           smr_char_range($0, $9),
                           smr_char($_),
                           smr_char($')]),
    smr_seq([FirstChar, smr_star(TailChar)]).
|
|
|
|
|
|
|
|
-spec smr_sf_con() -> string_matcher().
% @doc
% String matcher for a sophia constructor name, e.g.
%
%     Foo
%     Foo_Bar
%     Foo_Bar3_
%
% - Con = [A-Z][A-Za-z0-9_]*   constructors start with an upper case letter.
%
% Unlike identifiers, a constructor name cannot contain a single quote.
% @end
smr_sf_con() ->
    FirstChar = smr_char_range($A, $Z),
    % upper | lower | digit | underscore
    TailChar  = smr_union([smr_char_range($A, $Z),
                           smr_char_range($a, $z),
                           smr_char_range($0, $9),
                           smr_char($_)]),
    smr_seq([FirstChar, smr_star(TailChar)]).
|
|
|
|
|
|
|
|
-spec smr_sf_qid() -> string_matcher().
% @doc
% String matcher for a sophia qualified identifier, e.g.
%
%     Foo.Bar.Baz.quux
%
% - QId = (Con\.)+Id   qualified identifiers (e.g. `Map.member`)
% @end
smr_sf_qid() ->
    % one "Con." qualifier
    ConDot = smr_seq([smr_sf_con(), smr_char($.)]),
    smr_seq([smr_plus(ConDot), smr_sf_id()]).
|
|
|
|
|
|
|
|
-spec smr_sf_qcon() -> string_matcher().
% @doc
% String matcher for a sophia qualified constructor, e.g.
%
%     Foo.Bar.Baz
%
% - QCon = (Con\.)+Con   qualified constructor
%
% The matcher is built as the equivalent shape Con(\.Con)+ rather than the
% literal (Con\.)+Con. The matching engine is greedy and never backtracks, so
% with the literal shape the (Con\.)+ part swallows every "Con." it can find.
% On input such as "Foo.Bar.3" it would consume "Foo.Bar." and then fail to
% find a final Con, whereas a backtracking regex gives back the last "Bar."
% and matches "Foo.Bar". With Con(\.Con)+ a trailing ".Con" is only consumed
% when it is complete, so "Foo.Bar.3" matches "Foo.Bar" and leaves ".3".
% @end
smr_sf_qcon() ->
    Con    = smr_sf_con(),
    DotCon = smr_seq([smr_char($.), Con]),
    smr_seq([Con, smr_plus(DotCon)]).
|
|
|
|
|
|
|
|
-spec smr_sf_tvar() -> string_matcher().
% @doc
% String matcher for a sophia type variable: a single quote followed by an
% identifier, e.g.
%
%     'a
%     'foo_bar
%
% - TVar = 'Id   type variable
% @end
smr_sf_tvar() ->
    Tick = smr_char($'),
    smr_seq([Tick, smr_sf_id()]).
|
|
|
|
|
|
|
|
-spec smr_sf_int16() -> string_matcher().
% @doc
% String matcher for a sophia base 16 integer literal, e.g. 0xDEAD_BEEF
%
% so_scan handles base 10 and base 16 with a single pattern; here they are
% separate matchers (see smr_sf_int10/0) for clarity. This is the right-hand
% alternative of
%
% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)*
% @end
smr_sf_int16() ->
    Digit     = smr_union([smr_char_range($0, $9),
                           smr_char_range($A, $F),
                           smr_char_range($a, $f)]),
    Group     = smr_plus(Digit),
    % an underscore must be followed by at least one more digit
    MoreGroup = smr_seq([smr_char($_), Group]),
    smr_seq([smr_string("0x"), Group, smr_star(MoreGroup)]).
|
|
|
|
|
|
|
|
-spec smr_sf_int10() -> string_matcher().
% @doc
% String matcher for a sophia base 10 integer literal, e.g. 012_345_6_7
%
% so_scan handles base 10 and base 16 with a single pattern; here they are
% separate matchers (see smr_sf_int16/0) for clarity. This is the left-hand
% alternative of
%
% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)*
% @end
smr_sf_int10() ->
    Group     = smr_plus(smr_char_range($0, $9)),
    % an underscore must be followed by at least one more digit
    MoreGroup = smr_seq([smr_char($_), Group]),
    smr_seq([Group, smr_star(MoreGroup)]).
|
|
|
|
|
|
|
|
-spec smr_sf_bytes() -> string_matcher().
% @doc
% String matcher for a sophia byte array literal, e.g.
%
%     #DEAD_BEEF
%
% - Bytes = #[0-9A-Fa-f]+(_[0-9A-Fa-f]+)*   with optional `_` separators
% @end
smr_sf_bytes() ->
    Digit     = smr_union([smr_char_range($0, $9),
                           smr_char_range($A, $F),
                           smr_char_range($a, $f)]),
    Group     = smr_plus(Digit),
    % an underscore must be followed by at least one more digit
    MoreGroup = smr_seq([smr_char($_), Group]),
    smr_seq([smr_char($#), Group, smr_star(MoreGroup)]).
|
|
|
|
|
|
|
|
-spec smr_sf_str() -> string_matcher().
% @doc
% String matcher for a sophia string literal: a double quote, zero or more
% string characters (see smr_sf_strchar/0), and a closing double quote.
%
% String   string literal enclosed in " with escape character `\`
%
%     STRING = "\"([^\"\\\\]|(\\\\.))*\"",
% @end
smr_sf_str() ->
    DQuote = smr_char($"),
    Body   = smr_star(smr_sf_strchar()),
    smr_seq([DQuote, Body, DQuote]).
|
|
|
|
|
|
-spec smr_sf_strchar() -> string_matcher().
% @private
% String matcher for one "character" inside a sophia string literal: either a
% plain character or a backslash escape (two code points).
%
% This is the group inside
%
%     STRING = "\"([^\"\\\\]|(\\\\.))*\"",
%
% which, with the Erlang string escaping removed, is
%
%     ([^"\\]|(\\.))
% @end
smr_sf_strchar() ->
    % A literal newline is not accepted anywhere in a string:
    %
    %     "foo
    %     bar"
    %
    % is treated as not being a valid sophia string.
    %
    % NOTE(review): the class [^"\\] in the reference regex does not exclude
    % newline, so this looks stricter than the regex for unescaped newlines;
    % confirm against so_scan's actual behaviour.
    Backslash    = smr_char($\\),
    NonNewline   = smr_ncmatch(smr_char($\n), smr_dot()),
    % FIXME: maybe we should enforce escape sequence rules here, especially
    % to be consistent with the char literal rules. For now a backslash
    % followed by any non-newline character is accepted.
    Escaped      = smr_seq([Backslash, NonNewline]),
    QuoteOrSlash = smr_union([smr_char($"), Backslash]),
    Plain        = smr_ncmatch(QuoteOrSlash, NonNewline),
    smr_union([Plain, Escaped]).
|
|
|
|
|
|
|
|
-spec smr_sf_char() -> string_matcher().
% @doc
% String matcher for a sophia char literal.
%
% > Char   character literal enclosed in ' with escape character `\`
%
% From so_scan.erl:
%
%     %% Five cases for a character
%     %%  * 1 7-bit ascii, not \ or '
%     %%  * 2-4 8-bit values (UTF8)
%     %%  * \ followed by a known modifier [aernrtv]
%     %%  * \xhh
%     %%  * \x{hhh...}
%     CHAR = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
%
% Under all the escaping, that regex has the shape
%
%     '(...)'
%
% so this function only supplies the surrounding single quotes; everything in
% the hole is handled by smr_sf_char_inner/0.
% @end
smr_sf_char() ->
    Tick = smr_char($'),
    smr_seq([Tick, smr_sf_char_inner(), Tick]).
|
|
|
|
-spec smr_sf_char_inner() -> string_matcher().
% @private
% String matcher for what sits between the single quotes of a char literal.
%
% Starting from so_scan's CHAR regex, dropping the outer quotes and the
% grouping parens, and undoing the Erlang string escaping leaves five
% alternatives:
%
%     [\x00-\x26\x28-\x5b\x5d-\x7f]     one 7-bit char that is not ' or \
%   | [\x00-\xff][\x80-\xff]{1,3}       a multi-byte UTF-8 sequence
%   | \\[befnrtv'\\]                    backslash + one of befnrtv'\
%   | \\x[0-9a-fA-F]{2,2}               \xAB
%   | \\x\{[0-9a-fA-F]*\}               \x{DEADBEEF}
%
% The last three are translated directly. The first two are byte-oriented;
% since this module works on code points they are merged into "any single
% code point that is not ' or \".
% @end
smr_sf_char_inner() ->
    Hex         = smr_oneofchars("0123456789ABCDEFabcdef"),
    NamedEscape = smr_seq([smr_char($\\), smr_oneofchars("befnrtv'\\")]),
    TwoDigitHex = smr_seq([smr_string("\\x"), Hex, Hex]),
    BracedHex   = smr_seq([smr_string("\\x{"), smr_star(Hex), smr_char($})]),
    % FIXME: possible erroneous oversimplification here (this single rule
    % stands in for both the 7-bit and the multi-byte UTF-8 alternatives)
    PlainChar   = smr_ncmatch(smr_oneofchars([$', $\\]), smr_dot()),
    smr_union([NamedEscape, TwoDigitHex, BracedHex, PlainChar]).
|
|
|
|
|
|
|
|
-spec smr_sf_ak() -> string_matcher().
% @doc
% String matcher for an account address literal
%
%     ak_....
%
% > AccountAddress   base58-encoded 32 byte account pubkey with `ak_` prefix
%
% sophia's own tokenizer tokenizes ak_.../sg_... etc as identifiers and only
% disambiguates them in the parsing stage. Version 0.1 of the tokenizer is
% meant to match `so_scan` exactly, so it is expected to do the same; this
% matcher exists independently of that token step so it can be useful later.
% @end
smr_sf_ak() ->
    Prefix = "ak",
    smr_apistr58(Prefix).
|
|
|
|
|
|
|
|
-spec smr_sf_ct() -> string_matcher().
% @doc
% String matcher for a contract address literal
%
%     ct_....
%
% > ContractAddress   base58-encoded 32 byte contract address with `ct_` prefix
%
% sophia's own tokenizer tokenizes ak_.../sg_... etc as identifiers and only
% disambiguates them in the parsing stage. Version 0.1 of the tokenizer is
% meant to match `so_scan` exactly, so it is expected to do the same; this
% matcher exists independently of that token step so it can be useful later.
% @end
smr_sf_ct() ->
    Prefix = "ct",
    smr_apistr58(Prefix).
|
|
|
|
|
|
|
|
-spec smr_sf_sg() -> string_matcher().
% @doc
% String matcher for a signature literal
%
%     sg_....
%
% > Signature   base58-encoded 64 byte cryptographic signature with `sg_` prefix
%
% sophia's own tokenizer tokenizes ak_.../sg_... etc as identifiers and only
% disambiguates them in the parsing stage. Version 0.1 of the tokenizer is
% meant to match `so_scan` exactly, so it is expected to do the same; this
% matcher exists independently of that token step so it can be useful later.
% @end
smr_sf_sg() ->
    Prefix = "sg",
    smr_apistr58(Prefix).
|
|
|
|
|
|
-spec smr_apistr58(Prefix) -> string_matcher()
    when Prefix :: string().
% @private
% String matcher for a prefixed base58 API string:
%
%     ak_...
%     ct_...
%     sg_...
%
% Prefix is the part before the underscore ("ak", "ct", "sg"); the "..." is
% one or more base58 characters. Only the shape is matched: neither the
% length of the payload nor its decoded content is checked here.
% @end
smr_apistr58(Prefix) ->
    Tag     = smr_string(Prefix),
    Payload = smr_plus(smr_base58char()),
    smr_seq([Tag, smr_char($_), Payload]).
|
|
|
|
|
|
-spec smr_base58char() -> string_matcher().
% @private
% String matcher for a single character of the base58 alphabet used below
% (digits and letters, without 0, O, I and l).
smr_base58char() ->
    Alphabet = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz",
    smr_oneofchars(Alphabet).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
%---------------------------------------------------------
|
|
% API: string matcher primitive constructors
|
|
%---------------------------------------------------------
|
|
|
|
-spec smr_char(Char) -> string_matcher()
    when Char :: integer().
% @doc
% String matcher for exactly one specific character (code point).
%
%     /a/ <~> smr_char($a)
%
% Crashes with function_clause if Char is not an integer.
% @end
smr_char(Char) when is_integer(Char) ->
    {smr_char, Char}.
|
|
|
|
|
|
|
|
-spec smr_char_range(LowerBound, UpperBound) -> string_matcher()
    when LowerBound :: integer(),
         UpperBound :: integer().
% @doc
% String matcher for one character in an inclusive range of code points.
%
%     /[a-z]/ <~> smr_char_range($a, $z)
%     /[0-9]/ <~> smr_char_range($0, $9)
%
% Crashes with function_clause if either bound is not an integer.
% @end
smr_char_range(Lo, Hi) when is_integer(Lo), is_integer(Hi) ->
    {smr_char_range, Lo, Hi}.
|
|
|
|
|
|
|
|
-spec smr_union(StringMatchers) -> string_matcher()
    when StringMatchers :: [StringMatcher],
         StringMatcher  :: string_matcher().
% @doc
% String matcher for "any one of these". The alternatives are tried in list
% order and the first one that matches wins, so order matters when one
% alternative is a prefix of another.
%
%     /[abc]/     <~> smr_union([smr_char($a), smr_char($b), smr_char($c)])
%     /(foo|bar)/ <~> smr_union([smr_string("foo"), smr_string("bar")])
% @end
smr_union(Alternatives) when is_list(Alternatives) ->
    {smr_union, Alternatives}.
|
|
|
|
|
|
|
|
-spec smr_seq(StringMatchers) -> string_matcher()
    when StringMatchers :: [string_matcher()].
% @doc
% String matcher for "each of these, one after the other".
%
%     /abc/ <~> smr_seq([smr_char($a), smr_char($b), smr_char($c)])
%
% smr_string/1 is a shorthand for a sequence of single characters.
% @end
smr_seq(Steps) when is_list(Steps) ->
    {smr_seq, Steps}.
|
|
|
|
|
|
|
|
-spec smr_plus(Matcher) -> string_matcher()
    when Matcher :: string_matcher().
% @doc
% "One or more of"; like the `+` operator in regexes.
%
%     /(abc)+/ <~> smr_plus(smr_string("abc"))
%
% At match time the wrapped matcher must succeed once, after which the rest
% is handled exactly like smr_star/1: it is repeated greedily until it fails.
% @end
smr_plus(Inner) ->
    {smr_plus, Inner}.
|
|
|
|
|
|
|
|
-spec smr_star(Matcher) -> string_matcher()
    when Matcher :: string_matcher().
% @doc
% "Zero or more of"; like the `*` operator in regexes.
%
%     /(abc)*/ <~> smr_star(smr_string("abc"))
%
% At match time the wrapped matcher is applied repeatedly, accumulating what
% it consumes, until it fails; zero repetitions is still a successful match
% of the empty string. Repetition is greedy and is never backed out of.
% @end
smr_star(Inner) ->
    {smr_star, Inner}.
|
|
|
|
|
|
|
|
-spec smr_dot() -> string_matcher().
% @doc
% String matcher for any single character; analogous to /./
%
% At match time it consumes the first character of the input whatever it is
% (newline included), and fails only on empty input.
% @end
smr_dot() ->
    smr_dot.
|
|
|
|
|
|
|
|
-spec smr_ncmatch(MustNotMatch, Match) -> string_matcher()
    when MustNotMatch :: string_matcher(),
         Match        :: string_matcher().
% @doc
% Negative conditional match; analogous to `[^abc]` but more flexible.
%
%     /[^abc]/ <~> smr_ncmatch(smr_union([smr_char($a),
%                                         smr_char($b),
%                                         smr_char($c)]),
%                              smr_dot()).
%
% At match time MustNotMatch is tried first against the input. If it matches,
% the whole thing fails; only if it does not match is Match applied to the
% same input, and its result is the result.
% @end
smr_ncmatch(MustNotMatch, Match) ->
    {smr_ncmatch, MustNotMatch, Match}.
|
|
|
|
|
|
|
|
%---------------------------------------------------------
|
|
% string matcher helpers
|
|
%---------------------------------------------------------
|
|
|
|
-spec smr_string(Chars) -> string_matcher()
    when Chars :: string().
% @doc
% String matcher for a fixed string: the given characters, in order. This is
% like writing the string verbatim in a regex.
%
%     /foo/ <~> smr_string("foo")
%           <~> smr_seq([smr_char($f), smr_char($o), smr_char($o)])
%
% It is a pure rewrite in terms of smr_seq/1 and smr_char/1.
% @end
smr_string(Chars) when is_list(Chars) ->
    CharMatchers = [smr_char(Char) || Char <- Chars],
    smr_seq(CharMatchers).
|
|
|
|
|
|
|
|
-spec smr_oneofchars(Chars) -> UnionMatcher
    when Chars        :: string(),
         UnionMatcher :: string_matcher().
% @doc
% String matcher for any one of the given characters.
%
%     /[abc]/ <~> smr_oneofchars("abc")
%             <~> smr_union([smr_char($a), smr_char($b), smr_char($c)])
%
% This is the dual of smr_string/1: smr_string/1 puts the characters in
% sequence, this puts them in parallel. (It was briefly called "costring"
% for that reason; "oneofchars" is the less confusing name.)
% @end
smr_oneofchars(Chars) ->
    CharMatchers = [smr_char(Char) || Char <- Chars],
    smr_union(CharMatchers).
|
|
|
|
|
|
%%=======================================================================
|
|
%% INTERNALS: string matching logic
|
|
%%=======================================================================
|
|
|
|
|
|
-spec string_match(Matcher, Source) -> MaybeMatch
    when Matcher    :: string_matcher(),
         Source     :: string(),
         MaybeMatch :: {strmatch, Matched :: string(), Rest :: string()}
                     | no_strmatch.
% @private
% Match a prefix of Source against Matcher.
%
% Returns {strmatch, Matched, Rest}, where Matched is the consumed prefix as a
% flat string and Rest is the remaining input, or no_strmatch.
%
%     %% NOTIONAL code
%     string_match(/[abc]/, "abc") ->
%         {strmatch, "a", "bc"}
%     string_match(/[abc]/, "def") ->
%         no_strmatch
%
% Matching is greedy and never backtracks: a union commits to its first
% matching alternative, and star/plus keep everything they consumed.
% @end
string_match({smr_char, C}, [C | Rest]) ->
    {strmatch, [C], Rest};
string_match({smr_char, _}, _Src) ->
    no_strmatch;
string_match({smr_char_range, Lo, Hi}, [C | Rest]) when Lo =< C, C =< Hi ->
    {strmatch, [C], Rest};
string_match({smr_char_range, _, _}, _Src) ->
    no_strmatch;
string_match({smr_union, SMRs}, Src) ->
    sm_union(SMRs, Src);
string_match({smr_seq, SMRs}, Src) ->
    sm_seq(SMRs, [], Src);
string_match({smr_plus, SMR}, Src) ->
    sm_plus(SMR, Src);
string_match({smr_star, SMR}, Src) ->
    sm_star(SMR, [], Src);
string_match(smr_dot, [C | Rest]) ->
    {strmatch, [C], Rest};
string_match(smr_dot, []) ->
    no_strmatch;
string_match({smr_ncmatch, MustNotMatch, Match}, Src) ->
    % Both matchers see the same input: MustNotMatch acts as a veto, and
    % whatever it might have consumed is discarded.
    case string_match(MustNotMatch, Src) of
        no_strmatch      -> string_match(Match, Src);
        {strmatch, _, _} -> no_strmatch
    end.


% @private union must match *one* thing: the first alternative that matches
% decides the result; an empty union never matches.
sm_union([SMR | SMRs], Src) ->
    case string_match(SMR, Src) of
        no_strmatch -> sm_union(SMRs, Src);
        Match       -> Match
    end;
sm_union([], _Src) ->
    no_strmatch.


% @private sequence must match *EACH* thing, in order. Acc collects the
% matched pieces as a nested list and is flattened once at the end. An empty
% sequence matches the empty string.
sm_seq([SMR | SMRs], Acc, Src0) ->
    case string_match(SMR, Src0) of
        {strmatch, Str, Src1} -> sm_seq(SMRs, [Acc, Str], Src1);
        no_strmatch           -> no_strmatch
    end;
sm_seq([], Acc, Src) ->
    {strmatch, unicode:characters_to_list(Acc), Src}.


% @private plus matches at least one: one mandatory match, then the same
% loop as star with that first match as the starting accumulator.
sm_plus(SMR, Src0) ->
    case string_match(SMR, Src0) of
        {strmatch, Str, Src1} -> sm_star(SMR, Str, Src1);
        no_strmatch           -> no_strmatch
    end.


% @private star matches 0 or more, greedily.
%
% The loop ends when SMR stops matching, and also when SMR matches without
% consuming anything. Without the second condition a sub-matcher that can
% match the empty string (a nested star, an empty sequence, ...) would
% "succeed" forever on the same input and the loop would never terminate.
sm_star(SMR, Acc, Src0) ->
    case string_match(SMR, Src0) of
        % no (further) match: done
        no_strmatch ->
            {strmatch, unicode:characters_to_list(Acc), Src0};
        % matched the empty string: no progress is possible, done
        {strmatch, [], _} ->
            {strmatch, unicode:characters_to_list(Acc), Src0};
        % consumed something: keep going
        {strmatch, Str, Src1} ->
            sm_star(SMR, [Acc, Str], Src1)
    end.
|
|
|