% @doc
% A string matcher is roughly analogous to a regex. It describes a pattern,
% which a string may or may not match.
%
% This module is essentially a pure erlang implementation of the subset of
% regular expressions that are needed to tokenize sophia.
%
% The intent for now (May 2026) is simply to perfectly mimic the so_scan library
%
% Reference is `docs/sophia_syntax.md` as well as `src/so_scan_lib.erl` in
% original sophia lib
%
% From docs/sophia_syntax.md:
%
% - Id = [a-z_][A-Za-z0-9_']* identifiers start with a lower case letter.
% - Con = [A-Z][A-Za-z0-9_]* constructors start with an upper case letter.
% - QId = (Con\.)+Id qualified identifiers (e.g. `Map.member`)
% - QCon = (Con\.)+Con qualified constructor
% - TVar = 'Id type variable (e.g `'a`, `'b`)
% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* integer literal with optional `_` separators
% - Bytes = #[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* byte array literal with optional `_` separators
% - String string literal enclosed in " with escape character `\`
% - Char character literal enclosed in ' with escape character `\`
% - AccountAddress base58-encoded 32 byte account pubkey with `ak_` prefix
% - ContractAddress base58-encoded 32 byte contract address with `ct_` prefix
% - Signature base58-encoded 64 byte cryptographic signature with `sg_` prefix
%
% Sophia's notion of tokens also includes keywords, parens, whitespace, etc.
% Real reference is of course the code:
%
%     Number = fun(Digit) -> [Digit, "+(_", Digit, "+)*"] end,
%     DIGIT = "[0-9]",
%     HEXDIGIT = "[0-9a-fA-F]",
%     LOWER = "[a-z_]",
%     UPPER = "[A-Z]",
%     CON = [UPPER, "[a-zA-Z0-9_]*"],
%     INT = Number(DIGIT),
%     HEX = ["0x", Number(HEXDIGIT)],
%     BYTES = ["#", Number(HEXDIGIT)],
%     WS = "[\\000-\\ ]+",
%     ID = [LOWER, "[a-zA-Z0-9_']*"],
%     TVAR = ["'", ID],
%     QID = ["(", CON, "\\.)+", ID],
%     QCON = ["(", CON, "\\.)+", CON],
%     OP = "[=!<>+\\-*/:&|?~@^]+",
%     %% Five cases for a character
%     %% * 1 7-bit ascii, not \ or '
%     %% * 2-4 8-bit values (UTF8)
%     %% * \ followed by a known modifier [aernrtv]
%     %% * \xhh
%     %% * \x{hhh...}
%     CHAR = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
%     STRING = "\"([^\"\\\\]|(\\\\.))*\"",
%
%     CommentStart = {"/\\*", push(comment, skip())},
%     CommentRules =
%         [ CommentStart
%         , {"\\*/", pop(skip())}
%         , {"[^/*]+|[/*]", skip()} ],
%
%     Keywords = ["contract", "include", "let", "switch", "type", "record", "datatype", "if", "elif", "else", "function",
%                 "stateful", "payable", "true", "false", "mod", "public", "entrypoint", "private", "indexed", "namespace",
%                 "interface", "main", "using", "as", "for", "hiding", "band", "bor", "bxor", "bnot"
%                ],
%     KW = string:join(Keywords, "|"),
%
% There is a lot going on in that code. This module is purely the part that
% matches strings specifically. The *tokenizer* (gsc_tokenizer) knows the
% hierarchy of sophia tokens (e.g. it knows to match keywords before
% identifiers, so that `contract` gets tokenized as a keyword and not a
% variable name), and then calls into this module in order to match the string
% shape it's looking for.
% @end

-module(gs_strmatch).

%-compile([export_all, nowarn_export_all]).

-export_type([
    string_matcher/0
]).

% given a string matcher and a string, determine match or no
-export([
    match/2
]).
% string matchers for sophia token shapes
-export([
    smr_sf_ws/0,
    smr_sf_op/0,
    smr_sf_punct/0,
    smr_sf_id/0,
    smr_sf_con/0,
    smr_sf_qid/0,
    smr_sf_qcon/0,
    smr_sf_tvar/0,
    smr_sf_int16/0,
    smr_sf_int10/0,
    smr_sf_bytes/0,
    smr_sf_str/0,
    smr_sf_char/0,
    smr_sf_ak/0,
    smr_sf_ct/0,
    smr_sf_sg/0
]).

% regex primitives/combinators
-export([
    % plumbing
    smr_char/1,
    smr_char_range/2,
    smr_union/1,
    smr_seq/1,
    smr_plus/1,
    smr_star/1,
    smr_dot/0,
    smr_ncmatch/2,
    % porcelain
    smr_string/1,
    smr_oneofchars/1
]).

%%=======================================================================
%% API: Types
%%=======================================================================

-type string_matcher()
    :: {smr_char, integer()}                        % /a/, /b/
     | {smr_char_range, integer(), integer()}       % /[a-z]/
     | {smr_union, [string_matcher()]}              % /[abc]/
     | {smr_seq, [string_matcher()]}                % /abc/
     | {smr_plus, string_matcher()}                 % /(abc)+/
     | {smr_star, string_matcher()}                 % /(abc)*/
     | smr_dot                                      % /./
     % negative conditional match
     % /[^a-z]/, but more general
     % /[^a-z]/ <~> smr_ncmatch(smr_char_range($a, $z), smr_dot()).
     | {smr_ncmatch, MustNotMatch :: string_matcher(),
                     Match        :: string_matcher()}.

%=========================================================
% API: Functions
%=========================================================

%---------------------------------------------------------
% API: string matching logic
%
% -export([
%     match/2
% ]).
%---------------------------------------------------------

-spec match(Matcher, Source) -> MaybeMatch
    when Matcher    :: string_matcher(),
         Source     :: unicode:chardata(),
         MaybeMatch :: {strmatch, Matched :: string(), Rest :: string()}
                     | no_strmatch.
% @doc
% Try to match `Matcher' against the front of `Source'.
%
% `Source' is first normalized to an NFC list of codepoints; the matcher is
% then run against that list. On success the consumed prefix and the
% unconsumed remainder are returned, both as (normalized) codepoint lists.
%
% Raises `badarg' when `Source' is not valid unicode chardata.
% `unicode:characters_to_nfc_list/1' signals that case with an
% `{error, _, _}' tuple; feeding that tuple to the matching engine would yield
% matcher-dependent nonsense (a bogus `no_strmatch', a `case_clause' crash, or
% an error tuple leaking out as `Rest'), so it is rejected here at the
% boundary instead.
% @end

match(Matcher, Source) ->
    case unicode:characters_to_nfc_list(Source) of
        Chars when is_list(Chars) ->
            string_match(Matcher, Chars);
        {error, _Converted, _Invalid} ->
            erlang:error(badarg, [Matcher, Source])
    end.
%---------------------------------------------------------
% API: string matchers for sophia tokens
%
% -export([
%     smr_sf_ws/0,
%     smr_sf_op/0,
%     smr_sf_punct/0,
%     smr_sf_id/0,
%     smr_sf_con/0,
%     smr_sf_qid/0,
%     smr_sf_qcon/0,
%     smr_sf_tvar/0,
%     smr_sf_int16/0,
%     smr_sf_int10/0,
%     smr_sf_bytes/0,
%     smr_sf_str/0,
%     smr_sf_char/0,
%     smr_sf_ak/0,
%     smr_sf_ct/0,
%     smr_sf_sg/0
% ]).
%---------------------------------------------------------

-spec smr_sf_ws() -> string_matcher().
% @doc
% String matcher for a non-empty run of whitespace
%
% from so_scan.erl (9.0.0)
%
%     WS = "[\\000-\\ ]+",
%
% i.e. one or more codepoints in 0..32. All ascii codepoints which are 32 or
% lower are control chars or whitespace: https://www.asciitable.com/
% @end

smr_sf_ws() ->
    smr_plus(smr_oneofchars(lists:seq(0, 32))).



-spec smr_sf_op() -> string_matcher().
% @doc
% String matcher for a sophia operator: one or more operator characters
%
% from so_scan.erl (9.0.0)
%
%     OP = "[=!<>+\\-*/:&|?~@^]+",
% @end

smr_sf_op() ->
    smr_plus(smr_oneofchars("=!<>+-*/:&|?~@^")).



-spec smr_sf_punct() -> string_matcher().
% @doc
% String matcher for punctuation: `..', or a single one of `,.;()[]{}'
%
% from so_scan.erl (9.0.0)
%
%     , {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
%
% `..' is listed first in the union so that it wins over a lone `.'
% @end

smr_sf_punct() ->
    smr_union([smr_string(".."), smr_oneofchars(",.;()[]{}")]).



-spec smr_sf_id() -> string_matcher().
% @doc
% String matcher for a sophia identifier
%
%     foo
%     _foo
%     fooBar'
%
% - Id = [a-z_][A-Za-z0-9_']* identifiers start with a lower case letter.
% @end

smr_sf_id() ->
    % [a-z_]
    FirstChar = smr_union([smr_char_range($a, $z), smr_char($_)]),
    % [A-Za-z0-9_']
    TailChar  = smr_union([smr_char_range($A, $Z),
                           smr_char_range($a, $z),
                           smr_char_range($0, $9),
                           smr_char($_),
                           smr_char($')]),
    smr_seq([FirstChar, smr_star(TailChar)]).



-spec smr_sf_con() -> string_matcher().
% @doc
% String matcher for a sophia constructor name
%
%     Foo
%     Foo_Bar
%     Foo_Bar3_
%
% - Con = [A-Z][A-Za-z0-9_]* constructors start with an upper case letter.
%
% Unlike identifiers, a constructor name cannot contain `''.
% @end

smr_sf_con() ->
    % [A-Za-z0-9_]
    TailChar = smr_union([smr_char_range($A, $Z),
                          smr_char_range($a, $z),
                          smr_char_range($0, $9),
                          smr_char($_)]),
    smr_seq([smr_char_range($A, $Z), smr_star(TailChar)]).



-spec smr_sf_qid() -> string_matcher().
% @doc
% String matcher for a Sophia qualified identifier
%
%     Foo.Bar.Baz.quux
%
% - QId = (Con\.)+Id qualified identifiers (e.g. `Map.member`)
% @end

smr_sf_qid() ->
    % one "Con." prefix
    ConDot = smr_seq([smr_sf_con(), smr_char($.)]),
    smr_seq([smr_plus(ConDot), smr_sf_id()]).



-spec smr_sf_qcon() -> string_matcher().
% @doc
% String matcher for a sophia qualified constructor
%
%     Foo.Bar.Baz
%
% - QCon = (Con\.)+Con qualified constructor
% @end

smr_sf_qcon() ->
    % one "Con." prefix
    ConDot = smr_seq([smr_sf_con(), smr_char($.)]),
    smr_seq([smr_plus(ConDot), smr_sf_con()]).



-spec smr_sf_tvar() -> string_matcher().
% @doc
% String matcher for a sophia type variable; e.g.
%
%     'a
%     'foo_bar
%
% - TVar = 'Id type variable (e.g `'a`, `'b`)
% @end

smr_sf_tvar() ->
    Tick = smr_char($'),
    smr_seq([Tick, smr_sf_id()]).



-spec smr_sf_int16() -> string_matcher().
% @doc
% String matcher for a sophia base16 integer
%
%     0xDEAD_BEEF
%
% so_scan parses base10/base16 in one go, but i think it's clearer if they're
% different
%
% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* integer literal with optional `_` separators
% @end

smr_sf_int16() ->
    % [0-9A-Fa-f]+
    HexRun   = smr_plus(smr_union([smr_char_range($0, $9),
                                   smr_char_range($A, $F),
                                   smr_char_range($a, $f)])),
    % (_[0-9A-Fa-f]+)*
    MoreRuns = smr_star(smr_seq([smr_char($_), HexRun])),
    smr_seq([smr_string("0x"), HexRun, MoreRuns]).



-spec smr_sf_int10() -> string_matcher().
% @doc
% String matcher for a sophia base 10 int
%
%     012_345_6_7
%
% so_scan parses base10/base16 in one go, but i think it's clearer if they're
% different
%
% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* integer literal with optional `_` separators
% @end

smr_sf_int10() ->
    % [0-9]+
    DigitRun = smr_plus(smr_char_range($0, $9)),
    % (_[0-9]+)*
    MoreRuns = smr_star(smr_seq([smr_char($_), DigitRun])),
    smr_seq([DigitRun, MoreRuns]).



-spec smr_sf_bytes() -> string_matcher().
% @doc
% String matcher for a sophia bytestring
%
%     #DEAD_BEEF
%
% - Bytes = #[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* byte array literal with optional `_` separators
% @end

smr_sf_bytes() ->
    % [0-9A-Fa-f]+
    HexRun   = smr_plus(smr_union([smr_char_range($0, $9),
                                   smr_char_range($A, $F),
                                   smr_char_range($a, $f)])),
    % (_[0-9A-Fa-f]+)*
    MoreRuns = smr_star(smr_seq([smr_char($_), HexRun])),
    smr_seq([smr_char($#), HexRun, MoreRuns]).



-spec smr_sf_str() -> string_matcher().
% @doc
% String matcher for sophia string literal
%
% String string literal enclosed in " with escape character `\`
%
%     STRING = "\"([^\"\\\\]|(\\\\.))*\"",
% @end

smr_sf_str() ->
    DQuote = smr_char($"),
    smr_seq([DQuote, smr_star(smr_sf_strchar()), DQuote]).



-spec smr_sf_strchar() -> string_matcher().
% @private
% string matcher for a single character (or escape pair) inside a sophia
% string
%
%     STRING = "\"([^\"\\\\]|(\\\\.))*\"",
%
% this is for
%
%     ([^\"\\\\]|(\\\\.))
%
% cleaned up:
%
%     ([^"\\]|(\\.))
% @end

smr_sf_strchar() ->
    % cannot have a literal newline in string
    %
    %     "foo
    %     bar"
    %
    % is not a valid sophia string
    %
    % NOTE(review): the so_scan class [^"\\] does not itself exclude newline;
    % rejecting it is this module's choice -- confirm it is the intended
    % deviation
    NonNewline = smr_ncmatch(smr_char($\n), smr_dot()),
    % a plain char is anything that is neither `"` nor `\` (nor newline)
    Special    = smr_union([smr_char($"), smr_char($\\)]),
    Plain      = smr_ncmatch(Special, NonNewline),
    % FIXME: maybe we should enforce escape sequence rules here?
    %
    % especially to be consistent with char rules
    Escaped    = smr_seq([smr_char($\\), NonNewline]),
    smr_union([Plain, Escaped]).



-spec smr_sf_char() -> string_matcher().
% @doc
% String matcher for a Sophia char literal
%
% From so_scan.erl:
%
%     %% Five cases for a character
%     %% * 1 7-bit ascii, not \ or '
%     %% * 2-4 8-bit values (UTF8)
%     %% * \ followed by a known modifier [aernrtv]
%     %% * \xhh
%     %% * \x{hhh...}
%     CHAR = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
%
% (the so_scan comment says [aernrtv], but the regex itself accepts
% [befnrtv'\\]; this module follows the regex)
%
% > Char character literal enclosed in ' with escape character `\`
% @end

% ok we get this monstrosity
%
%     "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'"
%
% there are several levels of escaping in there, so let's break it down. First
% let's notice this pattern:
%
%     '(...)'.
%
% So let's make a hole

smr_sf_char() ->
    Tick = smr_char($'),
    smr_seq([Tick, smr_sf_char_inner(), Tick]).



% smr_sf_char_inner() will deal with the stuff in the monstrosity
%
% we had this before
%     "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'"
%
% let's trim
%     ([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\})
%
% and reorg
%       ([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])
%     | ([\\x00-\\xff][\\x80-\\xff]{1,3})
%     | (\\\\[befnrtv'\\\\])
%     | (\\\\x[0-9a-fA-F]{2,2})
%     | (\\\\x\\{[0-9a-fA-F]*\\})
%
% trim some more
%       [\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f]
%     | [\\x00-\\xff][\\x80-\\xff]{1,3}
%     | \\\\[befnrtv'\\\\]
%     | \\\\x[0-9a-fA-F]{2,2}
%     | \\\\x\\{[0-9a-fA-F]*\\}
%
% undo some escapes
%       [\x00-\x26\x28-\x5b\x5d-\x7f]
%     | [\x00-\xff][\x80-\xff]{1,3}
%     | \\[befnrtv'\\]
%     | \\x[0-9a-fA-F]{2,2}
%     | \\x\{[0-9a-fA-F]*\}
%
% rewrite
%     [^'\]                     <~> (16#00..16#26 | 16#28..16#5b | 16#5d..16#7f)
%     <<_:8, (_ >= 128){1,3}>>  <~> [\x00-\xff][\x80-\xff]{1,3}
%     <<$\\, X>>                <~> \\[befnrtv'\\]
%     \xAB                      <~> \\x[0-9a-fA-F]{2,2}
%     \x{DEADBEEF}              <~> \\x\{[0-9a-fA-F]*\}

-spec smr_sf_char_inner() -> string_matcher().
% @private
% string matcher for what sits between the two single quotes of a char
% literal: a simple escape, a two-digit hex escape, a braced hex escape, or
% any single character other than `'' and `\'. Alternatives are tried in that
% order.
% @end

smr_sf_char_inner() ->
    HexDigit  = smr_oneofchars("0123456789ABCDEFabcdef"),
    % \\[befnrtv'\\]
    SimpleEsc = smr_seq([smr_char($\\), smr_oneofchars("befnrtv'\\")]),
    % \\x[0-9a-fA-F]{2,2}
    ByteEsc   = smr_seq([smr_string("\\x"), HexDigit, HexDigit]),
    % \\x\{[0-9a-fA-F]*\}
    BracedEsc = smr_seq([smr_string("\\x{"), smr_star(HexDigit), smr_char($})]),
    % FIXME: possible erroneous oversimplification here
    %
    % the two "raw character" cases of the regex are collapsed into "any one
    % codepoint that is not ' or \"
    PlainChar = smr_ncmatch(smr_oneofchars([$', $\\]), smr_dot()),
    smr_union([SimpleEsc, ByteEsc, BracedEsc, PlainChar]).



-spec smr_sf_ak() -> string_matcher().
% @doc
% string matcher for
%
%     ak_....
%
% sophia's tokenizer tokenizes ak_.../sg_... etc as identifiers and then in the
% parsing stage disambiguates them
%
% i don't like that, but for version 0.1 we're going to match the behavior of
% `so_scan` exactly, just for clarity
%
% however, note that is the token step, we can still write a string matcher to
% be useful later
%
% > AccountAddress base58-encoded 32 byte account pubkey with `ak_` prefix
% @end

smr_sf_ak() ->
    smr_apistr58("ak").



-spec smr_sf_ct() -> string_matcher().
% @doc
% string matcher for
%
%     ct_....
%
% sophia's tokenizer tokenizes ak_.../sg_... etc as identifiers and then in the
% parsing stage disambiguates them
%
% i don't like that, but for version 0.1 we're going to match the behavior of
% `so_scan` exactly, just for clarity
%
% however, note that is the token step, we can still write a string matcher to
% be useful later
%
% > ContractAddress base58-encoded 32 byte contract address with `ct_` prefix
% @end

smr_sf_ct() ->
    smr_apistr58("ct").



-spec smr_sf_sg() -> string_matcher().
% @doc
% string matcher for
%
%     sg_....
%
% sophia's tokenizer tokenizes ak_.../sg_...
% etc as identifiers and then in the parsing stage disambiguates them
%
% i don't like that, but for version 0.1 we're going to match the behavior of
% `so_scan` exactly, just for clarity
%
% however, note that is the token step, we can still write a string matcher to
% be useful later
%
% > Signature base58-encoded 64 byte cryptographic signature with `sg_` prefix
% @end

smr_sf_sg() ->
    smr_apistr58("sg").



-spec smr_apistr58(Prefix) -> string_matcher()
    when Prefix :: string().
% @private
% string matcher for
%
%     ak_...
%     ct_...
%     sg_...
%
% i.e. the given prefix, an underscore, then one or more base58 chars.
% No length check is made on the base58 part.
% @end

smr_apistr58(Prefix) ->
    Payload = smr_plus(smr_base58char()),
    smr_seq([smr_string(Prefix), smr_char($_), Payload]).



-spec smr_base58char() -> string_matcher().
% @private
% string matcher for a single char of the base58 alphabet (digits and letters
% minus 0, O, I and l)
% @end

smr_base58char() ->
    smr_oneofchars("123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz").



%---------------------------------------------------------
% API: string matcher primitive constructors
%---------------------------------------------------------

-spec smr_char(Char) -> string_matcher()
    when Char :: integer().
% @doc
% string matcher for exactly one specific char (codepoint)
%
%     /a/ <~> smr_char($a)
% @end

smr_char(Codepoint) when is_integer(Codepoint) ->
    {smr_char, Codepoint}.



-spec smr_char_range(LowerBound, UpperBound) -> string_matcher()
    when LowerBound :: integer(),
         UpperBound :: integer().
% @doc
% string matcher for one char within an inclusive range
%
%     /[a-z]/ <~> smr_char_range($a, $z)
%     /[0-9]/ <~> smr_char_range($0, $9)
% @end

smr_char_range(Lo, Hi) when is_integer(Lo), is_integer(Hi) ->
    {smr_char_range, Lo, Hi}.



-spec smr_union(StringMatchers) -> string_matcher()
    when StringMatchers :: [StringMatcher],
         StringMatcher  :: string_matcher().
% @doc
% String matcher for alternatives: the first matcher in the list that matches
% decides the result
%
%     /[abc]/     <~> smr_union([smr_char($a), smr_char($b), smr_char($c)])
%     /(foo|bar)/ <~> smr_union([smr_string("foo"), smr_string("bar")])
% @end

smr_union(Alternatives) when is_list(Alternatives) ->
    {smr_union, Alternatives}.
-spec smr_seq(StringMatchers) -> string_matcher()
    when StringMatchers :: [string_matcher()].
% @doc
% String matcher for a sequence: every matcher must match, one after another
%
%     /abc/ <~> smr_seq([smr_char($a), smr_char($b), smr_char($c)])
%
% smr_string/1 is the shorthand for a sequence of plain chars
% @end

smr_seq(Steps) when is_list(Steps) ->
    {smr_seq, Steps}.



-spec smr_plus(Matcher) -> string_matcher()
    when Matcher :: string_matcher().
% @doc
% "one or more of"; like the `+` operator in regexes.
%
%     /(abc)+/ <~> smr_plus(smr_string("abc"))
% @end

smr_plus(Inner) ->
    {smr_plus, Inner}.



-spec smr_star(Matcher) -> string_matcher()
    when Matcher :: string_matcher().
% @doc
% "zero or more of"; like the `*` operator in regexes.
%
%     /(abc)*/ <~> smr_star(smr_string("abc"))
% @end

smr_star(Inner) ->
    {smr_star, Inner}.



-spec smr_dot() -> string_matcher().
% @doc
% matches any single character; analogous to /./
%
% fails only on empty input
% @end

smr_dot() ->
    smr_dot.



-spec smr_ncmatch(MustNotMatch, Match) -> string_matcher()
    when MustNotMatch :: string_matcher(),
         Match        :: string_matcher().
% @doc
% Negative conditional match; analogous to `[^abc]` but more flexible
%
% The input is matched against `Match', but only if `MustNotMatch' does *not*
% match at the same position.
%
%     /[^abc]/ <~> smr_ncmatch(smr_union([smr_char($a), smr_char($b), smr_char($c)]),
%                              smr_dot()).
% @end

smr_ncmatch(MustNotMatch, Match) ->
    {smr_ncmatch, MustNotMatch, Match}.
%---------------------------------------------------------
% string matcher helpers
%---------------------------------------------------------

-spec smr_string(Chars) -> string_matcher()
    when Chars :: string().
% @doc
% matches the given chars in sequence; basically like putting the string in
% raw in a regex
%
%     /foo/ <~> smr_string("foo")
%           <~> smr_seq([smr_char($f), smr_char($o), smr_char($o)])
%
% built purely on top of smr_seq/1 and smr_char/1
% @end

smr_string(String) when is_list(String) ->
    CharMatchers = [smr_char(Ch) || Ch <- String],
    smr_seq(CharMatchers).



-spec smr_oneofchars(Chars) -> UnionMatcher
    when Chars        :: string(),
         UnionMatcher :: string_matcher().
% @doc
% String matcher for exactly one char out of the given chars
%
%     /[abc]/ <~> smr_oneofchars("abc")
%             <~> smr_union([smr_char($a), smr_char($b), smr_char($c)])
%
% this is the dual of smr_string/1. string puts chars in sequence, this puts
% chars in parallel.
%
% (this function went by the name "costring" for a while; "oneofchars" is the
% name that stuck)
% @end

smr_oneofchars(Chars) ->
    CharMatchers = [smr_char(Ch) || Ch <- Chars],
    smr_union(CharMatchers).



%%=======================================================================
%% INTERNALS: string matching logic
%%=======================================================================

-spec string_match(Matcher, Source) -> MaybeMatch
    when Matcher    :: string_matcher(),
         Source     :: string(),
         MaybeMatch :: {strmatch, Matched :: string(), Rest :: string()}
                     | no_strmatch.
% @private
% See if a prefix of the source matches the given matcher; returns the matched
% prefix and the unconsumed rest, or no_strmatch
%
%     %% NOTIONAL code
%     string_match(/[abc]/, "abc") ->
%         {strmatch, "a", "bc"}
%     string_match(/[abc]/, "def") ->
%         no_strmatch
%
% Matching is greedy and never backtracks: a union commits to the first
% alternative that matches, and star/plus consume as many repetitions as they
% can.
% @end

string_match({smr_char, C}, [C | Rest]) ->
    {strmatch, [C], Rest};
string_match({smr_char, _}, _) ->
    no_strmatch;
string_match({smr_char_range, Lo, Hi}, [C | Rest]) when Lo =< C, C =< Hi ->
    {strmatch, [C], Rest};
string_match({smr_char_range, _, _}, _) ->
    no_strmatch;
string_match({smr_union, SMRs}, Src) ->
    sm_union(SMRs, Src);
string_match({smr_seq, SMRs}, Src) ->
    sm_seq(SMRs, [], Src);
string_match({smr_plus, SMR}, Src) ->
    sm_plus(SMR, Src);
string_match({smr_star, SMR}, Src) ->
    sm_star(SMR, [], Src);
string_match(smr_dot, [C | Rest]) ->
    {strmatch, [C], Rest};
string_match(smr_dot, []) ->
    no_strmatch;
string_match({smr_ncmatch, MustNotMatch, Match}, Src) ->
    % only the yes/no outcome of MustNotMatch matters; whatever it would have
    % consumed is discarded and Match runs from the same position
    case string_match(MustNotMatch, Src) of
        no_strmatch      -> string_match(Match, Src);
        {strmatch, _, _} -> no_strmatch
    end.



% @private union must match *one* thing: the first alternative that matches
% wins, later ones are never tried
sm_union([SMR | SMRs], Src) ->
    case string_match(SMR, Src) of
        no_strmatch -> sm_union(SMRs, Src);
        Match       -> Match
    end;
sm_union([], _) ->
    no_strmatch.



% @private sequence must match *EACH* thing, in order
%
% Acc collects the matched pieces as a nested list and is flattened once the
% whole sequence has matched
sm_seq([SMR | SMRs], Acc, Src0) ->
    case string_match(SMR, Src0) of
        {strmatch, Str, Src1} -> sm_seq(SMRs, [Acc, Str], Src1);
        no_strmatch           -> no_strmatch
    end;
sm_seq([], Acc, Src) ->
    {strmatch, unicode:characters_to_list(Acc), Src}.



% @private plus matches at least one: one mandatory match, then star
sm_plus(SMR, Src0) ->
    case string_match(SMR, Src0) of
        {strmatch, Str, Src1} -> sm_star(SMR, Str, Src1);
        no_strmatch           -> no_strmatch
    end.



% @private star matches 0 or more
%
% Repetition stops when the inner matcher fails *or* succeeds without
% consuming anything. The second condition matters: an inner matcher that can
% match the empty string (e.g. a nested star, or an empty sequence) would
% otherwise succeed forever at the same position and never terminate.
sm_star(SMR, Acc, Src0) ->
    case string_match(SMR, Src0) of
        % or more: made progress, keep going
        {strmatch, [_ | _] = Str, Src1} ->
            sm_star(SMR, [Acc, Str], Src1);
        % empty match: no progress, another round would be identical
        {strmatch, [], _} ->
            {strmatch, unicode:characters_to_list(Acc), Src0};
        % 0
        no_strmatch ->
            {strmatch, unicode:characters_to_list(Acc), Src0}
    end.