Files
gsc/src/gs_strmatch.erl
T
2026-06-02 01:48:05 -07:00

882 lines
24 KiB
Erlang

% @doc
% A string matcher is roughly analogous to a regex. It describes a pattern,
% which a string may or may not match.
%
% This module is essentially a pure erlang implementation of the subset of
% regular expressions that are needed to tokenize sophia.
%
% The intent for now (May 2026) is simply to perfectly mimic the so_scan library
%
% Reference is `docs/sophia_syntax.md` as well as `src/so_scan_lib.erl` in
% original sophia lib
%
% From docs/sophia_syntax.md:
%
% - Id = [a-z_][A-Za-z0-9_']* identifiers start with a lower case letter.
% - Con = [A-Z][A-Za-z0-9_]* constructors start with an upper case letter.
% - QId = (Con\.)+Id qualified identifiers (e.g. `Map.member`)
% - QCon = (Con\.)+Con qualified constructor
% - TVar = 'Id type variable (e.g `'a`, `'b`)
% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* integer literal with optional `_` separators
% - Bytes = #[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* byte array literal with optional `_` separators
% - String string literal enclosed in " with escape character `\`
% - Char character literal enclosed in ' with escape character `\`
% - AccountAddress base58-encoded 32 byte account pubkey with `ak_` prefix
% - ContractAddress base58-encoded 32 byte contract address with `ct_` prefix
% - Signature base58-encoded 64 byte cryptographic signature with `sg_` prefix
%
% Sophia's notion of tokens also includes keywords, parens, whitespace, etc.
% Real reference is of course the code:
%
% Number = fun(Digit) -> [Digit, "+(_", Digit, "+)*"] end,
% DIGIT = "[0-9]",
% HEXDIGIT = "[0-9a-fA-F]",
% LOWER = "[a-z_]",
% UPPER = "[A-Z]",
% CON = [UPPER, "[a-zA-Z0-9_]*"],
% INT = Number(DIGIT),
% HEX = ["0x", Number(HEXDIGIT)],
% BYTES = ["#", Number(HEXDIGIT)],
% WS = "[\\000-\\ ]+",
% ID = [LOWER, "[a-zA-Z0-9_']*"],
% TVAR = ["'", ID],
% QID = ["(", CON, "\\.)+", ID],
% QCON = ["(", CON, "\\.)+", CON],
% OP = "[=!<>+\\-*/:&|?~@^]+",
% %% Five cases for a character
% %% * 1 7-bit ascii, not \ or '
% %% * 2-4 8-bit values (UTF8)
% %% * \ followed by a known modifier [aernrtv]
% %% * \xhh
% %% * \x{hhh...}
% CHAR = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
% STRING = "\"([^\"\\\\]|(\\\\.))*\"",
%
% CommentStart = {"/\\*", push(comment, skip())},
% CommentRules =
% [ CommentStart
% , {"\\*/", pop(skip())}
% , {"[^/*]+|[/*]", skip()} ],
%
% Keywords = ["contract", "include", "let", "switch", "type", "record", "datatype", "if", "elif", "else", "function",
% "stateful", "payable", "true", "false", "mod", "public", "entrypoint", "private", "indexed", "namespace",
% "interface", "main", "using", "as", "for", "hiding", "band", "bor", "bxor", "bnot"
% ],
% KW = string:join(Keywords, "|"),
%
% There is a lot going on in that code. This module is purely the part that
% matches string shapes. The *tokenizer* (gsc_tokenizer) knows the hierarchy
% of sophia tokens (e.g. it knows to match keywords before identifiers, so that
% `contract` gets tokenized as a keyword and not a variable name), and then
% calls into this module in order to match the string shape it's looking for.
% @end
-module(gs_strmatch).
%-compile([export_all, nowarn_export_all]).
-export_type([
string_matcher/0
]).
% given a string matcher and a string, determine match or no
-export([
match/2
]).
% string matchers for sophia token shapes
-export([
smr_sf_ws/0,
smr_sf_op/0,
smr_sf_punct/0,
smr_sf_id/0,
smr_sf_con/0,
smr_sf_qid/0,
smr_sf_qcon/0,
smr_sf_tvar/0,
smr_sf_int16/0,
smr_sf_int10/0,
smr_sf_bytes/0,
smr_sf_str/0,
smr_sf_char/0,
smr_sf_ak/0,
smr_sf_ct/0,
smr_sf_sg/0
]).
% regex primitives/combinators
-export([
% plumbing
smr_char/1,
smr_char_range/2,
smr_union/1,
smr_seq/1,
smr_plus/1,
smr_star/1,
smr_dot/0,
smr_ncmatch/2,
% porcelain
smr_string/1,
smr_oneofchars/1
]).
%%=======================================================================
%% API: Types
%%=======================================================================
% A matcher is plain data: a tagged tuple (or the bare atom smr_dot) that
% string_match/2 interprets against the front of a string. The regex analogue
% of each variant is given next to it.
-type string_matcher()
:: {smr_char, integer()} % exactly one given code point: /a/
| {smr_char_range, integer(), integer()} % inclusive code point range: /[a-z]/
| {smr_union, [string_matcher()]} % first alternative that matches: /[abc]/, /(foo|bar)/
| {smr_seq, [string_matcher()]} % every matcher, in order: /abc/
| {smr_plus, string_matcher()} % one or more repetitions: /(abc)+/
| {smr_star, string_matcher()} % zero or more repetitions: /(abc)*/
| smr_dot % any single code point, newline included: /./
% negative conditional match: Match is only attempted when MustNotMatch
% fails at the same position
% /[^a-z]/, but more general
% /[^a-z]/ <~> smr_ncmatch(smr_char_range($a, $z), smr_dot()).
| {smr_ncmatch, MustNotMatch :: string_matcher(),
Match :: string_matcher()}.
%=========================================================
% API: Functions
%=========================================================
%---------------------------------------------------------
% API: string matching logic
%
% -export([
% match/2
% ]).
%---------------------------------------------------------
-spec match(Matcher, Source) -> MaybeMatch
    when Matcher    :: string_matcher(),
         Source     :: unicode:chardata(),
         MaybeMatch :: {strmatch, Matched :: string(), Rest :: string()}
                     | no_strmatch.
% @doc
% Try to match `Matcher' against the front of `Source'.
%
% `Source' is first normalized to an NFC list of code points, so `Matched' and
% `Rest' are always in terms of the normalized text, not the caller's original
% encoding. On success the result is `{strmatch, Matched, Rest}' where
% `Matched ++ Rest' is the normalized source; otherwise `no_strmatch'.
%
% The source type is `unicode:chardata()' rather than `iolist()': that is what
% unicode:characters_to_nfc_list/1 accepts, and an iolist cannot carry code
% points above 255 as list elements.
%
% NOTE(review): unicode:characters_to_nfc_list/1 returns an error tuple instead
% of a list when `Source' is not valid unicode; that tuple is passed on to the
% matching engine unchanged. Callers should hand in validated text.
% @end
match(Matcher, Source) ->
    string_match(Matcher, unicode:characters_to_nfc_list(Source)).
%---------------------------------------------------------
% API: string matchers for sophia tokens
%
% -export([
% smr_sf_ws/0,
% smr_sf_op/0,
% smr_sf_punct/0,
% smr_sf_id/0,
% smr_sf_con/0,
% smr_sf_qid/0,
% smr_sf_qcon/0,
% smr_sf_tvar/0,
% smr_sf_int16/0,
% smr_sf_int10/0,
% smr_sf_bytes/0,
% smr_sf_str/0,
% smr_sf_char/0,
% smr_sf_ak/0,
% smr_sf_ct/0,
% smr_sf_sg/0
% ]).
%---------------------------------------------------------
-spec smr_sf_ws() -> string_matcher().
% @doc
% String matcher for whitespace
%
% from so_scan.erl (9.0.0)
%
% WS = "[\\000-\\ ]+",
%
% turns out all the ascii codepoints which are 32 or lower are control chars or
% whitespace: https://www.asciitable.com/
% @end
smr_sf_ws() ->
    % `[\000-\ ]` is the contiguous code point range 0..32 (NUL up to and
    % including space). A single range test says that directly; enumerating
    % the 33 code points as a union would try them one by one for every
    % character of every whitespace run. The set of accepted strings is
    % unchanged.
    smr_plus(smr_char_range(0, $\s)).
-spec smr_sf_op() -> string_matcher().
% @doc
% String matcher for a sophia operator
%
% from so_scan.erl (9.0.0)
%
% OP = "[=!<>+\\-*/:&|?~@^]+",
% @end
smr_sf_op() ->
    % an operator is a non-empty run of characters from this alphabet
    OperatorAlphabet = "=!<>+-*/:&|?~@^",
    Alternatives = lists:map(fun smr_char/1, OperatorAlphabet),
    smr_plus(smr_union(Alternatives)).
-spec smr_sf_punct() -> string_matcher().
% @doc
% String matcher for parens/braces
%
% from so_scan.erl (9.0.0)
%
% , {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
% @end
smr_sf_punct() ->
    % ".." has to come first: a union returns the first alternative that
    % matches, and the single-character class would otherwise accept just
    % the first dot of "..".
    smr_union([smr_string(".."),
               smr_oneofchars(",.;()[]{}")]).
-spec smr_sf_id() -> string_matcher().
% @doc
% String matcher for a sophia identifier
%
% foo
% _foo
% fooBar'
%
% - Id = [a-z_][A-Za-z0-9_']* identifiers start with a lower case letter.
% @end
smr_sf_id() ->
    % character classes used below
    Upper      = smr_char_range($A, $Z),
    Lower      = smr_char_range($a, $z),
    Digit      = smr_char_range($0, $9),
    Underscore = smr_char($_),
    Quote      = smr_char($'),
    % [a-z_]
    Head = smr_union([Lower, Underscore]),
    % [A-Za-z0-9_']
    Tail = smr_union([Upper, Lower, Digit, Underscore, Quote]),
    smr_seq([Head, smr_star(Tail)]).
-spec smr_sf_con() -> string_matcher().
% @doc
% String matcher for a sophia constructor name
%
% Foo
% Foo_Bar
% Foo_Bar3_
%
% - Con = [A-Z][A-Za-z0-9_]* constructors start with an upper case letter.
% @end
smr_sf_con() ->
    Upper = smr_char_range($A, $Z),
    % [A-Za-z0-9_] -- unlike identifiers, no trailing quote is allowed
    Tail = smr_union([Upper,
                      smr_char_range($a, $z),
                      smr_char_range($0, $9),
                      smr_char($_)]),
    % [A-Z] followed by any number of tail characters
    smr_seq([Upper, smr_star(Tail)]).
-spec smr_sf_qid() -> string_matcher().
% @doc
% String matcher for a Sophia qualified identifier
%
% Foo.Bar.Baz.quux
%
% - QId = (Con\.)+Id qualified identifiers (e.g. `Map.member`)
% @end
smr_sf_qid() ->
    % one "Con." qualifier; at least one is required before the identifier
    DottedCon = smr_seq([smr_sf_con(), smr_char($.)]),
    smr_seq([smr_plus(DottedCon), smr_sf_id()]).
-spec smr_sf_qcon() -> string_matcher().
% @doc
%
% String matcher for a sophia qualified constructor
%
% Foo.Bar.Baz
%
% - QCon = (Con\.)+Con qualified constructor
% @end
smr_sf_qcon() ->
    % The matching engine is greedy and never backtracks. Written literally
    % as (Con\.)+Con, the `+` also swallows the last "Con." whenever another
    % dot follows, and the trailing Con then has nothing valid to match:
    %
    %   "Foo.Bar.123"  (Con\.)+ takes "Foo.Bar.", Con fails on "123"
    %
    % whereas the regex matches the prefix "Foo.Bar". The same language is
    % therefore spelled Con(\.Con)+ here, which needs no backtracking: each
    % ".Con" step either matches completely or leaves the input untouched.
    %
    %   "Foo.Bar"      -> matched "Foo.Bar", rest ""
    %   "Foo.Bar.123"  -> matched "Foo.Bar", rest ".123"
    %   "Foo"          -> no match (at least one ".Con" is required)
    Con = smr_sf_con(),
    DotCon = smr_seq([smr_char($.), Con]),
    smr_seq([Con, smr_plus(DotCon)]).
-spec smr_sf_tvar() -> string_matcher().
% @doc
% String matcher for a sophia type variable; e.g.
%
% 'a
% 'foo_bar
%
% - TVar = 'Id type variable (e.g `'a`, `'b`)
% @end
smr_sf_tvar() ->
    % a single quote immediately followed by an ordinary identifier
    Tick = smr_char($'),
    Name = smr_sf_id(),
    smr_seq([Tick, Name]).
-spec smr_sf_int16() -> string_matcher().
% @doc
% String matcher for a sophia base16 integer 0xDEAD_BEEF
%
% so_scan parses base10/base16 in one go, but i think it's clearer if they're
% different
%
% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* integer literal with optional `_` separators
% @end
smr_sf_int16() ->
    % [0-9A-Fa-f]
    Digit = smr_union([smr_char_range($0, $9),
                       smr_char_range($A, $F),
                       smr_char_range($a, $f)]),
    % a non-empty run of hex digits
    Group = smr_plus(Digit),
    % any number of "_" + digit-group continuations
    MoreGroups = smr_star(smr_seq([smr_char($_), Group])),
    % the "0x" prefix is lower case only
    smr_seq([smr_string("0x"), Group, MoreGroups]).
-spec smr_sf_int10() -> string_matcher().
% @doc
% string matcher for a sophia base 10 int 012_345_6_7
%
% so_scan parses base10/base16 in one go, but i think it's clearer if they're
% different
%
% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* integer literal with optional `_` separators
% @end
smr_sf_int10() ->
    % a non-empty run of decimal digits
    Group = smr_plus(smr_char_range($0, $9)),
    % any number of "_" + digit-group continuations; a trailing "_" with no
    % digits after it is simply left unconsumed
    MoreGroups = smr_star(smr_seq([smr_char($_), Group])),
    smr_seq([Group, MoreGroups]).
-spec smr_sf_bytes() -> string_matcher().
% @doc
% String matcher for a sophia bytestring
%
% #DEAD_BEEF
%
% - Bytes = #[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* byte array literal with optional `_` separators
% @end
smr_sf_bytes() ->
    % [0-9A-Fa-f]
    Digit = smr_union([smr_char_range($0, $9),
                       smr_char_range($A, $F),
                       smr_char_range($a, $f)]),
    % a non-empty run of hex digits
    Group = smr_plus(Digit),
    % any number of "_" + digit-group continuations
    MoreGroups = smr_star(smr_seq([smr_char($_), Group])),
    % same shape as a hex integer, but introduced by "#" instead of "0x"
    smr_seq([smr_char($#), Group, MoreGroups]).
-spec smr_sf_str() -> string_matcher().
% @doc
% String matcher for sophia string literal
%
% String string literal enclosed in " with escape character `\`
%
% STRING = "\"([^\"\\\\]|(\\\\.))*\"",
% @end
smr_sf_str() ->
    % opening quote, any number of string characters, closing quote
    DoubleQuote = smr_char($"),
    Body = smr_star(smr_sf_strchar()),
    smr_seq([DoubleQuote, Body, DoubleQuote]).
-spec smr_sf_strchar() -> string_matcher().
% @private
% string matcher for a character in a sophia string
%
% STRING = "\"([^\"\\\\]|(\\\\.))*\"",
%
% this is for
%
% ([^\"\\\\]|(\\\\.))
%
% cleaned up:
%
% ([^"\\]|(\\.))
% @end
smr_sf_strchar() ->
    % A literal newline is never accepted inside a string, so
    %
    %   "foo
    %   bar"
    %
    % is rejected. Both alternatives below are built on this.
    NonNewline = smr_ncmatch(smr_char($\n), smr_dot()),
    % the two characters that cannot appear bare: the closing quote and the
    % escape introducer
    QuoteOrBackslash = smr_union([smr_char($"), smr_char($\\)]),
    % alternative 1: an ordinary character
    Plain = smr_ncmatch(QuoteOrBackslash, NonNewline),
    % alternative 2: a backslash followed by any non-newline character
    %
    % FIXME: maybe we should enforce escape sequence rules here?
    %
    % especially to be consistent with char rules
    Escaped = smr_seq([smr_char($\\), NonNewline]),
    smr_union([Plain, Escaped]).
-spec smr_sf_char() -> string_matcher().
% @doc
% String matcher for a Sophia char literal
%
% From so_scan.erl:
%
% %% Five cases for a character
% %% * 1 7-bit ascii, not \ or '
% %% * 2-4 8-bit values (UTF8)
% %% * \ followed by a known modifier [aernrtv]
% %% * \xhh
% %% * \x{hhh...}
% CHAR = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
%
% > Char character literal enclosed in ' with escape character `\`
% @end
% ok we get this monstrosity
%
% "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'"
%
% there's like 4 levels of escaping and shit, so let's break it down. First
% let's notice this pattern:
%
% '(...)'.
%
% So let's make a hole
smr_sf_char() ->
    % '(...)' -- the interesting part lives in smr_sf_char_inner/0
    SingleQuote = smr_char($'),
    Inner = smr_sf_char_inner(),
    smr_seq([SingleQuote, Inner, SingleQuote]).
% smr_sf_char_inner() will deal with the stuff in the monstrosity
%
% we had this before
% "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'"
%
% let's trim
% ([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\})
%
% and reorg
% ([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])
% | ([\\x00-\\xff][\\x80-\\xff]{1,3})
% | (\\\\[befnrtv'\\\\])
% | (\\\\x[0-9a-fA-F]{2,2})
% | (\\\\x\\{[0-9a-fA-F]*\\})
%
% trim some more
% [\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f]
% | [\\x00-\\xff][\\x80-\\xff]{1,3}
% | \\\\[befnrtv'\\\\]
% | \\\\x[0-9a-fA-F]{2,2}
% | \\\\x\\{[0-9a-fA-F]*\\}
%
% undo some escapes
% [\x00-\x26\x28-\x5b\x5d-\x7f]
% | [\x00-\xff][\x80-\xff]{1,3}
% | \\[befnrtv'\\]
% | \\x[0-9a-fA-F]{2,2}
% | \\x\{[0-9a-fA-F]*\}
%
% rewrite
% [^'\] <~> (16#00..16#26 | 16#28..16#5b | 16#5d..16#7f)
% <<_:8, (_ >= 128){1,3}>> <~> [\x00-\xff][\x80-\xff]{1,3}
% <<$\\, X>> <~> \\[befnrtv'\\]
% \xAB <~> \\x[0-9a-fA-F]{2,2}
% \x{DEADBEEF} <~> \\x\{[0-9a-fA-F]*\}
smr_sf_char_inner() ->
    Backslash = smr_char($\\),
    HexDigit  = smr_oneofchars("0123456789ABCDEFabcdef"),
    % \b \e \f \n \r \t \v \' \\
    NamedEscape = smr_seq([Backslash, smr_oneofchars("befnrtv'\\")]),
    % \xAB -- exactly two hex digits
    FixedHexEscape = smr_seq([smr_string("\\x"), HexDigit, HexDigit]),
    % \x{DEADBEEF} -- any number of hex digits (including none) in braces
    BracedHexEscape = smr_seq([smr_string("\\x{"),
                               smr_star(HexDigit),
                               smr_char($})]),
    % any single code point other than ' and \
    %
    % FIXME: possible erroneous oversimplification here
    Unescaped = smr_ncmatch(smr_oneofchars([$', $\\]), smr_dot()),
    % order matters: the union takes the first alternative that matches
    smr_union([NamedEscape, FixedHexEscape, BracedHexEscape, Unescaped]).
-spec smr_sf_ak() -> string_matcher().
% @doc
% string matcher for
%
% ak_....
%
% sophia's tokenizer tokenizes ak_.../sg_... etc as identifiers and then in the
% parsing stage disambiguates them
%
% i don't like that, but for version 0.1 we're going to match the behavior of
% `so_scan` exactly, just for clarity
%
% however, note that is the token step, we can still write a string matcher to
% be useful later
%
% > AccountAddress base58-encoded 32 byte account pubkey with `ak_` prefix
% @end
smr_sf_ak() ->
    % account addresses: "ak" + "_" + base58 payload
    Prefix = "ak",
    smr_apistr58(Prefix).
-spec smr_sf_ct() -> string_matcher().
% @doc
% string matcher for
%
% ct_....
%
% sophia's tokenizer tokenizes ak_.../sg_... etc as identifiers and then in the
% parsing stage disambiguates them
%
% i don't like that, but for version 0.1 we're going to match the behavior of
% `so_scan` exactly, just for clarity
%
% however, note that is the token step, we can still write a string matcher to
% be useful later
%
% > ContractAddress base58-encoded 32 byte contract address with `ct_` prefix
% @end
smr_sf_ct() ->
    % contract addresses: "ct" + "_" + base58 payload
    Prefix = "ct",
    smr_apistr58(Prefix).
-spec smr_sf_sg() -> string_matcher().
% @doc
% string matcher for
%
% sg_....
%
% sophia's tokenizer tokenizes ak_.../sg_... etc as identifiers and then in the
% parsing stage disambiguates them
%
% i don't like that, but for version 0.1 we're going to match the behavior of
% `so_scan` exactly, just for clarity
%
% however, note that is the token step, we can still write a string matcher to
% be useful later
%
% > Signature base58-encoded 64 byte cryptographic signature with `sg_` prefix
% @end
smr_sf_sg() ->
    % signatures: "sg" + "_" + base58 payload
    Prefix = "sg",
    smr_apistr58(Prefix).
-spec smr_apistr58(Prefix) -> string_matcher()
when Prefix :: string().
% @private
% string matcher for
%
% ak_...
% ct_...
% sg_...
%
% prefix is given as arg
%
% ... are base58 chars
% @end
smr_apistr58(Prefix) ->
    % <Prefix> "_" <one or more base58 characters>
    %
    % only the shape is checked here; nothing verifies the payload length
    Tag = smr_string(Prefix),
    Separator = smr_char($_),
    Payload = smr_plus(smr_base58char()),
    smr_seq([Tag, Separator, Payload]).
% @private
% string matcher for a single character of the base58 alphabet used below
% (digits and letters with 0, I, O and l left out)
smr_base58char() ->
    Alphabet = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz",
    smr_oneofchars(Alphabet).
%---------------------------------------------------------
% API: string matcher primitive constructors
%---------------------------------------------------------
-spec smr_char(Char) -> string_matcher()
when Char :: integer().
% @doc
% string matcher for a specific char
%
% /[abc]/ <~> smr_union([smr_char($a), smr_char($b), smr_char($c)])
%
% @end
smr_char(Codepoint) when is_integer(Codepoint) ->
    % non-integers are rejected here, when the matcher is built, rather
    % than silently never matching later
    {smr_char, Codepoint}.
-spec smr_char_range(LowerBound, UpperBound) -> string_matcher()
when LowerBound :: integer(),
UpperBound :: integer().
% @doc
% string matcher for a range of characters
%
% /[a-z]/ <~> smr_char_range($a, $z)
% /[0-9]/ <~> smr_char_range($0, $9)
% @end
smr_char_range(Lowest, Highest) when is_integer(Lowest), is_integer(Highest) ->
    % both bounds are inclusive; the bounds are stored as given, without
    % checking that Lowest =< Highest
    {smr_char_range, Lowest, Highest}.
-spec smr_union(StringMatchers) -> string_matcher()
when StringMatchers :: [StringMatcher],
StringMatcher :: string_matcher().
% @doc
% String matcher that matches on the first matcher given that matches
%
% /[abc]/ <~> smr_union([smr_char($a), smr_char($b), smr_char($c)])
% /(foo|bar)/ <~> smr_union([smr_string("foo"), smr_string("bar")])
% @end
smr_union(Alternatives) when is_list(Alternatives) ->
    % alternatives are tried in list order, so the order given here is the
    % match priority
    {smr_union, Alternatives}.
-spec smr_seq(StringMatchers) -> string_matcher()
when StringMatchers :: [string_matcher()].
% @doc
% Match a sequence of matchers
%
% /abc/ <~> smr_seq([smr_char($a), smr_char($b), smr_char($c)])
%
% smr_string/1 just maps to a sequence of chars
% @end
smr_seq(Steps) when is_list(Steps) ->
    % every step must match, one after the other, for the sequence to match
    {smr_seq, Steps}.
-spec smr_plus(Matcher) -> string_matcher()
when Matcher :: string_matcher().
% @doc
% "one or more of"; like the `+` operator in regexes.
%
% sm_plus(SMR, Src0) ->
% case string_match(SMR, Src0) of
% {strmatch, Str, Src1} -> sm_star(SMR, Str, Src1);
% no_strmatch -> no_strmatch
% end.
% @end
smr_plus(Repeated) ->
    % one or more repetitions of the wrapped matcher
    {smr_plus, Repeated}.
-spec smr_star(Matcher) -> string_matcher()
when Matcher :: string_matcher().
% @doc
% "zero or more of"; like the `*` operator in regexes.
%
% sm_star(SMR, Acc, Src0) ->
% case string_match(SMR, Src0) of
% % 0
% no_strmatch ->
% {strmatch, unicode:characters_to_list(Acc), Src0};
% % or more
% {strmatch, Str, Src1} ->
% sm_star(SMR, [Acc, Str], Src1)
% end.
% @end
smr_star(Repeated) ->
    % zero or more repetitions of the wrapped matcher
    {smr_star, Repeated}.
-spec smr_dot() -> string_matcher().
% @doc
% matches every character; analogous to /./
%
% string_match(smr_dot, SrcStr) ->
% case SrcStr of
% [C | Rest] -> {strmatch, [C], Rest};
% [] -> no_strmatch
% end;
% @end
smr_dot() ->
    % the only matcher with no parameters, hence a bare atom, not a tuple
    smr_dot.
-spec smr_ncmatch(MustNotMatch, Match) -> string_matcher()
when MustNotMatch :: string_matcher(),
Match :: string_matcher().
% @doc
% Negative conditional match; analogous to `[^abc]` but more flexible
%
%
% /[^abc]/ <-> smr_ncmatch(smr_union([smr_char($a), smr_char($b), smr_char($c)]),
% smr_dot()).
%
%
% string_match({smr_ncmatch, MustNotMatch, Match}, SrcStr) ->
% case string_match(MustNotMatch, SrcStr) of
% no_strmatch -> string_match(Match, SrcStr);
% _ -> no_strmatch
% end.
%
% @end
smr_ncmatch(MustNotMatch, Match) ->
    % argument order is (forbidden, wanted): Match is only tried where
    % MustNotMatch fails
    {smr_ncmatch, MustNotMatch, Match}.
%---------------------------------------------------------
% string matcher helpers
%---------------------------------------------------------
-spec smr_string(Chars) -> string_matcher()
when Chars :: string().
% @doc
% matches chars given in sequence; basically like putting the string in raw in
% a regex
%
% /foo/ <~> smr_string("foo")
% <~> smr_seq([smr_char($f), smr_char($o), smr_char($o)])
%
% rewrite over smr_seq/1 and smr_char/1
%
% smr_string(String) when is_list(String) ->
% smr_seq([smr_char(C) || C <- String]).
% @end
smr_string(String) when is_list(String) ->
    % one smr_char per character, wrapped in a sequence so they must appear
    % back to back and in order
    CharMatchers = [smr_char(Char) || Char <- String],
    smr_seq(CharMatchers).
-spec smr_oneofchars(Chars) -> UnionMatcher
when Chars :: string(),
UnionMatcher :: string_matcher().
% @doc
% String matcher for one of chars
%
% /[abc]/ <~> smr_oneofchars("abc")
% <~> smr_union([smr_char($a), smr_char($b), smr_char($c)])
%
% this is the dual of smr_string/1. string puts chars in sequence, this puts
% chars in parallel.
%
% "costring" nomenclature is chosen specifically to annoy craig
%
% if you fix your stupid url schema i will consider changing this name
%
% the thing is though this is actually a good name, your url schema is just...
% well you know it's compact, so you have amazon beat. no page-long urls for
% gajumarket
%
% you know what, we're keeping both names
%
% i'm confusing myself, renaming to "oneofchars"
% @end
smr_oneofchars(Chars) ->
    % one smr_char per character, wrapped in a union so that any single one
    % of them matches
    Alternatives = [smr_char(Char) || Char <- Chars],
    smr_union(Alternatives).
%%=======================================================================
%% INTERNALS: string matching logic
%%=======================================================================
-spec string_match(Matcher, Source) -> MaybeMatch
when Matcher :: string_matcher(),
Source :: string(),
MaybeMatch :: {strmatch, Matched :: string(), Rest :: string()}
| no_strmatch.
% @private
% See if the source matches the given matcher; returns
%
% %% NOTIONAL code
% string_match(/[abc]/, "abc") ->
% {strmatch, "a", "bc"}
% string_match(/[abc]/, "def") ->
% no_strmatch
% @end
% The engine is greedy and never backtracks: every matcher either consumes a
% prefix of the source and returns {strmatch, Matched, Rest}, or returns
% no_strmatch and leaves the source untouched for the caller.

% a single exact code point; the repeated C in the head is an exact (=:=) test
string_match({smr_char, C}, [C | Rest]) ->
    {strmatch, [C], Rest};
string_match({smr_char, _}, _) ->
    no_strmatch;
% an inclusive code point range
string_match({smr_char_range, Lo, Hi}, [C | Rest]) when Lo =< C, C =< Hi ->
    {strmatch, [C], Rest};
string_match({smr_char_range, _, _}, _) ->
    no_strmatch;
string_match({smr_union, SMRs}, Src) ->
    sm_union(SMRs, Src);
string_match({smr_seq, SMRs}, Src) ->
    sm_seq(SMRs, [], Src);
string_match({smr_plus, SMR}, Src) ->
    sm_plus(SMR, Src);
string_match({smr_star, SMR}, Src) ->
    sm_star(SMR, [], Src);
% any single code point at all (newline included); only end of input fails
string_match(smr_dot, [C | Rest]) ->
    {strmatch, [C], Rest};
string_match(smr_dot, []) ->
    no_strmatch;
% MustNotMatch is only a test: whatever it would have consumed is discarded
% and Match is run from the same position
string_match({smr_ncmatch, MustNotMatch, Match}, Src) ->
    case string_match(MustNotMatch, Src) of
        no_strmatch -> string_match(Match, Src);
        _           -> no_strmatch
    end.

% @private union must match *one* thing; alternatives are tried in list order
% and the first success wins, even if a later alternative would match more
sm_union([SMR | SMRs], Src) ->
    case string_match(SMR, Src) of
        no_strmatch -> sm_union(SMRs, Src);
        Match       -> Match
    end;
sm_union([], _) ->
    no_strmatch.

% @private sequence must match *EACH* thing
%
% Acc collects the matched pieces as a nested list and is flattened once at
% the end. If any step fails the whole sequence fails and nothing is consumed:
% the caller still holds the original source.
sm_seq([SMR | SMRs], Acc, Src0) ->
    case string_match(SMR, Src0) of
        {strmatch, Str, Src1} -> sm_seq(SMRs, [Acc, Str], Src1);
        no_strmatch           -> no_strmatch
    end;
sm_seq([], Acc, Src) ->
    {strmatch, unicode:characters_to_list(Acc), Src}.

% @private plus matches at least one, then behaves like star
sm_plus(SMR, Src0) ->
    case string_match(SMR, Src0) of
        {strmatch, Str, Src1} -> sm_star(SMR, Str, Src1);
        no_strmatch           -> no_strmatch
    end.

% @private star matches 0 or more
%
% The loop stops on the first failure *or* on the first zero-width success.
% A matcher that succeeds without consuming anything (a nested star, an empty
% sequence, ...) would succeed the same way at the same position forever, so
% repeating it can add nothing to the result and would never terminate.
sm_star(SMR, Acc, Src0) ->
    case string_match(SMR, Src0) of
        % progress was made: keep going from the new position
        {strmatch, [_ | _] = Str, Src1} ->
            sm_star(SMR, [Acc, Str], Src1);
        % no match, or a match that consumed nothing: done
        _ ->
            {strmatch, unicode:characters_to_list(Acc), Src0}
    end.