gsc/src/gs_strmatch.erl

% @doc
% A string matcher is roughly analogous to a regex. It describes a pattern,
% which a string may or may not match.
%
% This module is essentially a pure erlang implementation of the subset of
% regular expressions that are needed to tokenize sophia.
%
% The intent for now (May 2026) is simply to perfectly mimic the so_scan library
%
% Reference is `docs/sophia_syntax.md` as well as `src/so_scan_lib.erl` in
% original sophia lib
%
% From docs/sophia_syntax.md:
%
% - Id = [a-z_][A-Za-z0-9_']* identifiers start with a lower case letter.
% - Con = [A-Z][A-Za-z0-9_]* constructors start with an upper case letter.
% - QId = (Con\.)+Id qualified identifiers (e.g. `Map.member`)
% - QCon = (Con\.)+Con qualified constructor
% - TVar = 'Id type variable (e.g `'a`, `'b`)
% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* integer literal with optional `_` separators
% - Bytes = #[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* byte array literal with optional `_` separators
% - String string literal enclosed in " with escape character `\`
% - Char character literal enclosed in ' with escape character `\`
% - AccountAddress base58-encoded 32 byte account pubkey with `ak_` prefix
% - ContractAddress base58-encoded 32 byte contract address with `ct_` prefix
% - Signature base58-encoded 64 byte cryptographic signature with `sg_` prefix
%
% Sophia's notion of tokens also includes keywords, parens, whitespace, etc.
% Real reference is of course the code:
%
%       Number   = fun(Digit) -> [Digit, "+(_", Digit, "+)*"] end,
%       DIGIT    = "[0-9]",
%       HEXDIGIT = "[0-9a-fA-F]",
%       LOWER    = "[a-z_]",
%       UPPER    = "[A-Z]",
%       CON      = [UPPER, "[a-zA-Z0-9_]*"],
%       INT      = Number(DIGIT),
%       HEX      = ["0x", Number(HEXDIGIT)],
%       BYTES    = ["#", Number(HEXDIGIT)],
%       WS       = "[\\000-\\ ]+",
%       ID       = [LOWER, "[a-zA-Z0-9_']*"],
%       TVAR     = ["'", ID],
%       QID      = ["(", CON, "\\.)+", ID],
%       QCON     = ["(", CON, "\\.)+", CON],
%       OP       = "[=!<>+\\-*/:&|?~@^]+",
%       %% Five cases for a character
%       %%  * 1 7-bit ascii, not \ or '
%       %%  * 2-4 8-bit values (UTF8)
%       %%  * \ followed by a known modifier [aernrtv]
%       %%  * \xhh
%       %%  * \x{hhh...}
%       CHAR     = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
%       STRING   = "\"([^\"\\\\]|(\\\\.))*\"",
%
%       CommentStart = {"/\\*", push(comment, skip())},
%       CommentRules =
%           [ CommentStart
%           , {"\\*/",        pop(skip())}
%           , {"[^/*]+|[/*]", skip()} ],
%
%       Keywords = ["contract", "include", "let", "switch", "type", "record", "datatype", "if", "elif", "else", "function",
%                   "stateful", "payable", "true", "false", "mod", "public", "entrypoint", "private", "indexed", "namespace",
%                   "interface", "main", "using", "as", "for", "hiding", "band", "bor", "bxor", "bnot"
%                  ],
%       KW = string:join(Keywords, "|"),
%
% There is a lot going on in that code. This module is purely the part that
% matches string shapes. The *tokenizer* (gsc_tokenizer) knows the hierarchy
% of sophia tokens (e.g. it knows to match keywords before identifiers, so that
% `contract` gets tokenized as a keyword and not a variable name), and then
% calls into this module in order to match the string shape it's looking for.
% @end
-module(gs_strmatch).

% debugging aid, intentionally left disabled:
%-compile([export_all, nowarn_export_all]).


% the matcher term type, so other modules can use it in their specs
-export_type([
    string_matcher/0
]).

% entry point: given a string matcher and a source string, report whether the
% front of the source matches
-export([
    match/2
]).

% ready-made string matchers for sophia token shapes
-export([
    smr_sf_ws/0,
    smr_sf_op/0,
    smr_sf_punct/0,
    smr_sf_id/0,
    smr_sf_con/0,
    smr_sf_qid/0,
    smr_sf_qcon/0,
    smr_sf_tvar/0,
    smr_sf_int16/0,
    smr_sf_int10/0,
    smr_sf_bytes/0,
    smr_sf_str/0,
    smr_sf_char/0,
    smr_sf_ak/0,
    smr_sf_ct/0,
    smr_sf_sg/0
]).


% regex-like primitives/combinators used to build the matchers above
-export([
    % plumbing: one constructor per string_matcher() variant
    smr_char/1,
    smr_char_range/2,
    smr_union/1,
    smr_seq/1,
    smr_plus/1,
    smr_star/1,
    smr_dot/0,
    smr_ncmatch/2,
    % porcelain: conveniences written in terms of the plumbing
    smr_string/1,
    smr_oneofchars/1
]).


%%=======================================================================
%% API: Types
%%=======================================================================

-type string_matcher()
    :: {smr_char, integer()}                        % one exact codepoint: /a/
     | {smr_char_range, integer(), integer()}       % inclusive range: /[a-z]/
     | {smr_union, [string_matcher()]}              % first alternative that matches: /[abc]/, /(foo|bar)/
     | {smr_seq,   [string_matcher()]}              % each matcher in order: /abc/
     | {smr_plus,  string_matcher()}                % one or more: /(abc)+/
     | {smr_star,  string_matcher()}                % zero or more: /(abc)*/
     | smr_dot                                      % any single character: /./
       % negative conditional match: succeeds with Match only when
       % MustNotMatch fails at the same position.
       % like /[^a-z]/, but more general
       % /[^a-z]/ <~> smr_ncmatch(smr_char_range($a, $z), smr_dot()).
     | {smr_ncmatch, MustNotMatch :: string_matcher(),
                     Match        :: string_matcher()}.


%=========================================================
% API: Functions
%=========================================================

%---------------------------------------------------------
% API: string matching logic
%
% -export([
%     match/2
% ]).
%---------------------------------------------------------

-spec match(Matcher, Source) -> MaybeMatch
    when Matcher    :: string_matcher(),
         Source     :: iolist(),
         MaybeMatch :: {strmatch, Matched :: string(), Rest :: string()}
                     | no_strmatch.
% @doc
% Try to match `Matcher' against the front of `Source'.
%
% The source is first converted to an NFC-normalized list of codepoints, so
% every matcher works on whole characters rather than on bytes. On success the
% result carries the matched prefix and the unconsumed remainder; otherwise
% `no_strmatch' is returned.
%
% NOTE(review): unicode:characters_to_nfc_list/1 returns an `{error, _, _}'
% tuple for invalid input, and that value is handed to string_match/2 as-is.
% Confirm callers only pass valid unicode.
% @end
match(Matcher, Source) ->
    Codepoints = unicode:characters_to_nfc_list(Source),
    string_match(Matcher, Codepoints).


%---------------------------------------------------------
% API: string matchers for sophia tokens
%
% -export([
%    smr_sf_ws/0,
%    smr_sf_op/0,
%    smr_sf_punct/0,
%    smr_sf_id/0,
%    smr_sf_con/0,
%    smr_sf_qid/0,
%    smr_sf_qcon/0,
%    smr_sf_tvar/0,
%    smr_sf_int16/0,
%    smr_sf_int10/0,
%    smr_sf_bytes/0,
%    smr_sf_str/0,
%    smr_sf_char/0,
%    smr_sf_ak/0,
%    smr_sf_ct/0,
%    smr_sf_sg/0
% ]).
%---------------------------------------------------------

-spec smr_sf_ws() -> string_matcher().
% @doc
% String matcher for a run of whitespace: one or more characters whose
% codepoint is between 0 and 32 (space) inclusive.
%
% from so_scan.erl (9.0.0)
%
%    WS       = "[\\000-\\ ]+",
%
% Every ascii codepoint at or below 32 is either a control character or
% whitespace: https://www.asciitable.com/
% @end
smr_sf_ws() ->
    % NUL (0) up to and including space (32)
    AtMostSpace = smr_oneofchars(lists:seq(0, $\s)),
    smr_plus(AtMostSpace).


-spec smr_sf_op() -> string_matcher().
% @doc
% String matcher for a sophia operator: one or more operator characters.
%
% from so_scan.erl (9.0.0)
%
%    OP       = "[=!<>+\\-*/:&|?~@^]+",
% @end
smr_sf_op() ->
    % one alternative per operator character, in the order listed
    OneOpChar = smr_union(lists:map(fun smr_char/1, "=!<>+-*/:&|?~@^")),
    smr_plus(OneOpChar).


-spec smr_sf_punct() -> string_matcher().
% @doc
% String matcher for sophia punctuation: the two-character `..` token, or a
% single one of `,` `.` `;` `(` `)` `[` `]` `{` `}`.
%
% from so_scan.erl (9.0.0)
%
%       , {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
% @end

smr_sf_punct() ->
    % `..` is listed first: a union takes the first alternative that matches,
    % so this keeps ".." from being matched as a single "."
    smr_union([smr_string(".."), smr_oneofchars(",.;()[]{}")]).


-spec smr_sf_id() -> string_matcher().
% @doc
% String matcher for a sophia identifier: a lower case letter or underscore,
% followed by any number of letters, digits, underscores and single quotes.
%
%   foo
%   _foo
%   fooBar'
%
% - Id = [a-z_][A-Za-z0-9_']* identifiers start with a lower case letter.
% @end

smr_sf_id() ->
    % [a-z_]
    FirstChar = smr_union([smr_char_range($a, $z), smr_char($_)]),
    % [A-Za-z0-9_']
    RestChar  = smr_union([smr_char_range($A, $Z),
                           smr_char_range($a, $z),
                           smr_char_range($0, $9),
                           smr_char($_),
                           smr_char($')]),
    smr_seq([FirstChar, smr_star(RestChar)]).


-spec smr_sf_con() -> string_matcher().
% @doc
% String matcher for a sophia constructor name: an upper case letter followed
% by any number of letters, digits and underscores. Unlike identifiers, a
% single quote is NOT accepted.
%
%   Foo
%   Foo_Bar
%   Foo_Bar3_
%
% - Con = [A-Z][A-Za-z0-9_]* constructors start with an upper case letter.
% @end

smr_sf_con() ->
    % [A-Z]
    Initial    = smr_char_range($A, $Z),
    Underscore = {smr_char, $_},
    % [A-Za-z0-9_]
    Trailing   = {smr_union, [Initial,
                              smr_char_range($a, $z),
                              smr_char_range($0, $9),
                              Underscore]},
    smr_seq([Initial, smr_star(Trailing)]).


-spec smr_sf_qid() -> string_matcher().

% @doc
% String matcher for a Sophia qualified identifier: one or more
% `Constructor.` qualifiers followed by an identifier.
%
%   Foo.Bar.Baz.quux
%
% - QId = (Con\.)+Id qualified identifiers (e.g. `Map.member`)
% @end
smr_sf_qid() ->
    % a single "Con." qualifier
    ConDot = smr_seq([smr_sf_con(), smr_char($.)]),
    smr_seq([smr_plus(ConDot), smr_sf_id()]).


-spec smr_sf_qcon() -> string_matcher().

% @doc
% String matcher for a sophia qualified constructor: one or more
% `Constructor.` qualifiers followed by a final constructor name.
%
%   Foo.Bar.Baz
%
% - QCon = (Con\.)+Con qualified constructor
%
% NOTE(review): repetition in this module is greedy and never backtracks, so
% for an input such as `Foo.Bar.` (trailing dot) the qualifiers consume
% everything and the final constructor fails, giving no_strmatch. A
% backtracking regex would match `Foo.Bar` there. Confirm whether that
% difference matters for so_scan parity.
% @end
smr_sf_qcon() ->
    Con    = smr_sf_con(),
    % a single "Con." qualifier
    ConDot = smr_seq([Con, smr_char($.)]),
    smr_seq([smr_plus(ConDot), Con]).


-spec smr_sf_tvar() -> string_matcher().

% @doc
% String matcher for a sophia type variable: a single quote immediately
% followed by an identifier; e.g.
%
%   'a
%   'foo_bar
%
% - TVar = 'Id type variable (e.g `'a`, `'b`)
% @end
smr_sf_tvar() ->
    Tick = smr_char($'),
    smr_seq([Tick, smr_sf_id()]).


-spec smr_sf_int16() -> string_matcher().

% @doc
% String matcher for a sophia base16 integer: `0x`, a run of hex digits, then
% any number of `_`-separated further runs, e.g. 0xDEAD_BEEF
%
% so_scan matches base10 and base16 with one pattern; they are kept as two
% separate matchers here for clarity.
%
% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* integer literal with optional `_` separators
% @end
smr_sf_int16() ->
    % [0-9A-Fa-f]
    Nibble   = smr_union([smr_char_range($0, $9),
                          smr_char_range($A, $F),
                          smr_char_range($a, $f)]),
    Run      = smr_plus(Nibble),
    Prefix   = smr_string("0x"),
    % (_[0-9A-Fa-f]+)*
    MoreRuns = smr_star(smr_seq([smr_char($_), Run])),
    smr_seq([Prefix, Run, MoreRuns]).


-spec smr_sf_int10() -> string_matcher().

% @doc
% String matcher for a sophia base 10 integer: a run of decimal digits, then
% any number of `_`-separated further runs, e.g. 012_345_6_7
%
% so_scan matches base10 and base16 with one pattern; they are kept as two
% separate matchers here for clarity.
%
% - Int = [0-9]+(_[0-9]+)*|0x[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* integer literal with optional `_` separators
% @end
smr_sf_int10() ->
    % [0-9]+
    Run      = smr_plus(smr_char_range($0, $9)),
    % (_[0-9]+)*
    MoreRuns = smr_star(smr_seq([smr_char($_), Run])),
    smr_seq([Run, MoreRuns]).


-spec smr_sf_bytes() -> string_matcher().

% @doc
% String matcher for a sophia byte array literal: `#`, a run of hex digits,
% then any number of `_`-separated further runs.
%
%   #DEAD_BEEF
%
% - Bytes = #[0-9A-Fa-f]+(_[0-9A-Fa-f]+)* byte array literal with optional `_` separators
% @end
smr_sf_bytes() ->
    % [0-9A-Fa-f]
    Nibble   = smr_union([smr_char_range($0, $9),
                          smr_char_range($A, $F),
                          smr_char_range($a, $f)]),
    Run      = smr_plus(Nibble),
    Hash     = smr_char($#),
    % (_[0-9A-Fa-f]+)*
    MoreRuns = smr_star(smr_seq([smr_char($_), Run])),
    smr_seq([Hash, Run, MoreRuns]).


-spec smr_sf_str() -> string_matcher().

% @doc
% String matcher for a sophia string literal: an opening `"`, any number of
% string characters (see smr_sf_strchar/0), and a closing `"`.
%
% String string literal enclosed in " with escape character `\`
%
%  STRING   = "\"([^\"\\\\]|(\\\\.))*\"",
% @end
smr_sf_str() ->
    Quote = smr_char($"),
    Body  = smr_star(smr_sf_strchar()),
    smr_seq([Quote, Body, Quote]).


-spec smr_sf_strchar() -> string_matcher().
% @private
% String matcher for one "character" inside a sophia string literal: either a
% plain character, or a backslash followed by one more character.
%
%  STRING   = "\"([^\"\\\\]|(\\\\.))*\"",
%
% this covers the repeated group
%
%  ([^\"\\\\]|(\\\\.))
%
% which, with the erlang string escaping removed, is
%
%   ([^"\\]|(\\.))
%
% NOTE(review): in a PCRE-style regex a negated class such as [^"\\] also
% matches a newline, so the reference pattern may accept a literal newline
% inside a string while this matcher does not. Confirm which is intended.
% @end
smr_sf_strchar() ->
    Backslash  = smr_char($\\),
    % a literal newline is rejected, so
    %
    %   "foo
    %   bar"
    %
    % does not match as a sophia string
    NonNewline = smr_ncmatch(smr_char($\n), smr_dot()),
    % plain character: anything but `"`, `\` and newline
    Plain      = smr_ncmatch(smr_union([smr_char($"), Backslash]), NonNewline),
    % FIXME: maybe we should enforce escape sequence rules here?
    %
    % especially to be consistent with char rules
    Escaped    = smr_seq([Backslash, NonNewline]),
    smr_union([Plain, Escaped]).


-spec smr_sf_char() -> string_matcher().
% @doc
% String matcher for a Sophia char literal: a single quote, one character or
% escape sequence (see smr_sf_char_inner/0), and a closing single quote.
%
% From so_scan.erl:
%
%       %% Five cases for a character
%       %%  * 1 7-bit ascii, not \ or '
%       %%  * 2-4 8-bit values (UTF8)
%       %%  * \ followed by a known modifier [aernrtv]
%       %%  * \xhh
%       %%  * \x{hhh...}
%       CHAR     = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
%
% > Char character literal enclosed in ' with escape character `\`
% @end

% The reference pattern is hard to read because of the stacked escaping, but
% its overall shape is simply
%
%   '(...)'
%
% so the quotes are handled here and everything between them is delegated.
smr_sf_char() ->
    Tick = smr_char($'),
    smr_seq([Tick, smr_sf_char_inner(), Tick]).

% smr_sf_char_inner() matches what sits between the quotes of a char literal.
%
% Starting from the so_scan pattern
%   "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'"
%
% drop the surrounding quotes
%   ([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\})
%
% put one alternative per line
%     ([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])
%   | ([\\x00-\\xff][\\x80-\\xff]{1,3})
%   | (\\\\[befnrtv'\\\\])
%   | (\\\\x[0-9a-fA-F]{2,2})
%   | (\\\\x\\{[0-9a-fA-F]*\\})
%
% drop the grouping parens
%     [\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f]
%   | [\\x00-\\xff][\\x80-\\xff]{1,3}
%   | \\\\[befnrtv'\\\\]
%   | \\\\x[0-9a-fA-F]{2,2}
%   | \\\\x\\{[0-9a-fA-F]*\\}
%
% undo the erlang string escaping
%     [\x00-\x26\x28-\x5b\x5d-\x7f]
%   | [\x00-\xff][\x80-\xff]{1,3}
%   | \\[befnrtv'\\]
%   | \\x[0-9a-fA-F]{2,2}
%   | \\x\{[0-9a-fA-F]*\}
%
% and read each alternative as
%   [^'\]                    <~> (16#00..16#26 | 16#28..16#5b | 16#5d..16#7f)
%   <<_:8, (_ >= 128){1,3}>> <~> [\x00-\xff][\x80-\xff]{1,3}
%   <<$\\, X>>               <~> \\[befnrtv'\\]
%   \xAB                     <~> \\x[0-9a-fA-F]{2,2}
%   \x{DEADBEEF}             <~> \\x\{[0-9a-fA-F]*\}

smr_sf_char_inner() ->
    Backslash = smr_char($\\),
    HexDigit  = smr_oneofchars("0123456789ABCDEFabcdef"),
    % backslash + one of b e f n r t v ' \
    NamedEsc  = smr_seq([Backslash, smr_oneofchars("befnrtv'\\")]),
    % \xhh: exactly two hex digits
    ByteEsc   = smr_seq([smr_string("\\x"), HexDigit, HexDigit]),
    % \x{hhh...}: any number of hex digits inside braces
    BraceEsc  = smr_seq([smr_string("\\x{"), smr_star(HexDigit), smr_char($})]),
    % FIXME: possible erroneous oversimplification here
    % the two literal-character alternatives of the reference pattern are
    % folded into "any single character other than ' and \"
    Literal   = smr_ncmatch(smr_oneofchars("'\\"), smr_dot()),
    smr_union([NamedEsc, ByteEsc, BraceEsc, Literal]).


-spec smr_sf_ak() -> string_matcher().
% @doc
% String matcher for an account address literal
%
%   ak_....
%
% sophia's tokenizer tokenizes ak_.../sg_... etc as identifiers and only
% disambiguates them in the parsing stage. For version 0.1 the behavior of
% `so_scan` is mirrored exactly, so this matcher is not needed for that; it is
% provided because a dedicated matcher may be useful later.
%
% > AccountAddress base58-encoded 32 byte account pubkey with `ak_` prefix
% @end
smr_sf_ak() ->
    Prefix = "ak",
    smr_apistr58(Prefix).


-spec smr_sf_ct() -> string_matcher().
% @doc
% String matcher for a contract address literal
%
%   ct_....
%
% sophia's tokenizer tokenizes ak_.../sg_... etc as identifiers and only
% disambiguates them in the parsing stage. For version 0.1 the behavior of
% `so_scan` is mirrored exactly, so this matcher is not needed for that; it is
% provided because a dedicated matcher may be useful later.
%
% > ContractAddress base58-encoded 32 byte contract address with `ct_` prefix
% @end
smr_sf_ct() ->
    Prefix = "ct",
    smr_apistr58(Prefix).


-spec smr_sf_sg() -> string_matcher().
% @doc
% String matcher for a signature literal
%
%   sg_....
%
% sophia's tokenizer tokenizes ak_.../sg_... etc as identifiers and only
% disambiguates them in the parsing stage. For version 0.1 the behavior of
% `so_scan` is mirrored exactly, so this matcher is not needed for that; it is
% provided because a dedicated matcher may be useful later.
%
% > Signature base58-encoded 64 byte cryptographic signature with `sg_` prefix
% @end
smr_sf_sg() ->
    Prefix = "sg",
    smr_apistr58(Prefix).


-spec smr_apistr58(Prefix) -> string_matcher()
    when Prefix :: string().
% @private
% String matcher for a prefixed base58 literal
%
%   ak_...
%   ct_...
%   sg_...
%
% `Prefix' is the part before the underscore; the `...' is one or more base58
% characters. Only the shape is matched: the length of the base58 part is not
% checked.
% @end
smr_apistr58(Prefix) ->
    Tag       = smr_string(Prefix),
    Separator = smr_char($_),
    Payload   = smr_plus(smr_base58char()),
    smr_seq([Tag, Separator, Payload]).


% @private
% String matcher for a single base58 character: the digits without 0, the
% upper case letters without I and O, and the lower case letters without l.
smr_base58char() ->
    Alphabet = "123456789"
               "ABCDEFGHJKLMNPQRSTUVWXYZ"
               "abcdefghijkmnopqrstuvwxyz",
    smr_oneofchars(Alphabet).


%---------------------------------------------------------
% API: string matcher primitive constructors
%---------------------------------------------------------

-spec smr_char(Char) -> string_matcher()
    when Char :: integer().
% @doc
% String matcher for exactly one specific character, given as a codepoint
%
%   /a/ <~> smr_char($a)
%
% Typically used as a building block, e.g.
%
%   /[abc]/ <~> smr_union([smr_char($a), smr_char($b), smr_char($c)])
%
% Only integers are accepted.
% @end
smr_char(Codepoint) when is_integer(Codepoint) ->
    {smr_char, Codepoint}.


-spec smr_char_range(LowerBound, UpperBound) -> string_matcher()
    when LowerBound :: integer(),
         UpperBound :: integer().
% @doc
% String matcher for one character taken from a range of codepoints; both
% bounds are part of the range.
%
%   /[a-z]/ <~> smr_char_range($a, $z)
%   /[0-9]/ <~> smr_char_range($0, $9)
%
% Only integer bounds are accepted.
% @end
smr_char_range(Lo, Hi) when is_integer(Lo), is_integer(Hi) ->
    {smr_char_range, Lo, Hi}.


-spec smr_union(StringMatchers) -> string_matcher()
    when StringMatchers :: [StringMatcher],
         StringMatcher  :: string_matcher().
% @doc
% String matcher for alternatives: the alternatives are tried in the order
% given, and the first one that matches wins.
%
%   /[abc]/     <~> smr_union([smr_char($a), smr_char($b), smr_char($c)])
%   /(foo|bar)/ <~> smr_union([smr_string("foo"), smr_string("bar")])
% @end
smr_union(Alternatives) when is_list(Alternatives) ->
    {smr_union, Alternatives}.


-spec smr_seq(StringMatchers) -> string_matcher()
    when StringMatchers :: [string_matcher()].
% @doc
% String matcher for a sequence: every matcher has to match, one after the
% other, in the order given.
%
%   /abc/ <~> smr_seq([smr_char($a), smr_char($b), smr_char($c)])
%
% smr_string/1 is a shorthand for a sequence of single characters.
% @end
smr_seq(Parts) when is_list(Parts) ->
    {smr_seq, Parts}.


-spec smr_plus(Matcher) -> string_matcher()
    when Matcher :: string_matcher().
% @doc
% "one or more of"; like the `+` operator in regexes.
%
%   /(abc)+/ <~> smr_plus(smr_string("abc"))
%
% When applied, the inner matcher has to match once, and from there on the
% repetition behaves like smr_star/1:
%
%   sm_plus(SMR, Src0) ->
%       case string_match(SMR, Src0) of
%           {strmatch, Str, Src1} -> sm_star(SMR, Str, Src1);
%           no_strmatch               -> no_strmatch
%       end.
% @end
smr_plus(Inner) ->
    {smr_plus, Inner}.


-spec smr_star(Matcher) -> string_matcher()
    when Matcher :: string_matcher().
% @doc
% "zero or more of"; like the `*` operator in regexes.
%
%   /(abc)*/ <~> smr_star(smr_string("abc"))
%
% When applied, the inner matcher is repeated until it stops matching, and
% whatever was collected up to that point (possibly nothing) is the match:
%
%   sm_star(SMR, Acc, Src0) ->
%       case string_match(SMR, Src0) of
%           % 0
%           no_strmatch ->
%               {strmatch, unicode:characters_to_list(Acc), Src0};
%           % or more
%           {strmatch, Str, Src1} ->
%               sm_star(SMR, [Acc, Str], Src1)
%       end.
% @end
smr_star(Inner) ->
    {smr_star, Inner}.


-spec smr_dot() -> string_matcher().
% @doc
% String matcher for any single character; analogous to /./
%
% It consumes exactly one character, whatever it is (a newline included), and
% does not match on empty input:
%
% string_match(smr_dot, SrcStr) ->
%     case SrcStr of
%         [C | Rest] -> {strmatch, [C], Rest};
%         []         -> no_strmatch
%     end;
% @end
smr_dot() ->
    % this variant carries no data, so it is a bare atom
    smr_dot.


-spec smr_ncmatch(MustNotMatch, Match) -> string_matcher()
    when MustNotMatch :: string_matcher(),
         Match        :: string_matcher().
% @doc
% Negative conditional match; analogous to `[^abc]` but more flexible.
%
% The result is whatever `Match' gives, provided `MustNotMatch' does NOT match
% at the same position; if it does, the whole thing is a no-match.
%
%   /[^abc]/ <-> smr_ncmatch(smr_union([smr_char($a), smr_char($b), smr_char($c)]),
%                            smr_dot()).
%
%
%   string_match({smr_ncmatch, MustNotMatch, Match}, SrcStr) ->
%       case string_match(MustNotMatch, SrcStr) of
%           no_strmatch -> string_match(Match, SrcStr);
%           _       -> no_strmatch
%       end.
%
% @end
smr_ncmatch(MustNotMatch, Match) ->
    {smr_ncmatch, MustNotMatch, Match}.


%---------------------------------------------------------
% string matcher helpers
%---------------------------------------------------------

-spec smr_string(Chars) -> string_matcher()
    when Chars :: string().
% @doc
% String matcher for a fixed piece of text: the given characters, in order;
% basically like writing the string verbatim in a regex
%
%   /foo/ <~> smr_string("foo")
%         <~> smr_seq([smr_char($f), smr_char($o), smr_char($o)])
%
% It is defined purely in terms of smr_seq/1 and smr_char/1.
% @end
smr_string(String) when is_list(String) ->
    CharMatchers = [smr_char(Char) || Char <- String],
    smr_seq(CharMatchers).


-spec smr_oneofchars(Chars) -> UnionMatcher
    when Chars        :: string(),
         UnionMatcher :: string_matcher().
% @doc
% String matcher for any one of the given characters
%
%  /[abc]/ <~> smr_oneofchars("abc")
%          <~> smr_union([smr_char($a), smr_char($b), smr_char($c)])
%
% This is the dual of smr_string/1: smr_string puts the chars in sequence,
% this puts them in parallel. (The helper used to be called "costring" for
% that reason; it was renamed to "oneofchars" because that is less confusing.)
% @end
smr_oneofchars(Chars) ->
    CharMatchers = [smr_char(Char) || Char <- Chars],
    smr_union(CharMatchers).


%%=======================================================================
%% INTERNALS: string matching logic
%%=======================================================================


-spec string_match(Matcher, Source) -> MaybeMatch
    when Matcher    :: string_matcher(),
         Source     :: string(),
         MaybeMatch :: {strmatch, Matched :: string(), Rest :: string()}
                     | no_strmatch.
% @private
% Apply a matcher to the front of the source string. On success the result
% holds the matched prefix and what is left of the source.
%
%   %% NOTIONAL code
%   string_match(/[abc]/, "abc") ->
%       {strmatch, "a", "bc"}
%   string_match(/[abc]/, "def") ->
%       no_strmatch
%
% One clause (or pair of clauses) per string_matcher() variant; the compound
% variants are delegated to the sm_* helpers.
% @end
% exact character: the head of the source must equal C
string_match({smr_char, C}, [C | Rest]) ->
    {strmatch, [C], Rest};
string_match({smr_char, _}, _) ->
    no_strmatch;
% character range, both bounds included
string_match({smr_char_range, Lo, Hi}, [C | Rest]) when Lo =< C, C =< Hi ->
    {strmatch, [C], Rest};
string_match({smr_char_range, _, _}, _) ->
    no_strmatch;
string_match({smr_union, Alternatives}, Src) ->
    sm_union(Alternatives, Src);
string_match({smr_seq, Parts}, Src) ->
    sm_seq(Parts, [], Src);
string_match({smr_plus, Inner}, Src) ->
    sm_plus(Inner, Src);
string_match({smr_star, Inner}, Src) ->
    sm_star(Inner, [], Src);
% any one character; only an empty source is a no-match
string_match(smr_dot, Src) ->
    case Src of
        []         -> no_strmatch;
        [C | Rest] -> {strmatch, [C], Rest}
    end;
% Wanted is only tried when Forbidden does not match at this position
string_match({smr_ncmatch, Forbidden, Wanted}, Src) ->
    case string_match(Forbidden, Src) of
        no_strmatch -> string_match(Wanted, Src);
        _           -> no_strmatch
    end.


% @private union must match *one* thing
%
% Alternatives are tried left to right and the first match is returned as-is;
% running out of alternatives is a no-match.
sm_union([], _Src) ->
    no_strmatch;
sm_union([First | Others], Src) ->
    case string_match(First, Src) of
        no_strmatch -> sm_union(Others, Src);
        Found       -> Found
    end.


% @private sequence must match *EACH* thing
%
% Each matcher is applied to what the previous one left over. The matched
% pieces are collected in Acc as nested chardata and flattened once at the
% end; the first matcher that fails makes the whole sequence a no-match.
sm_seq([], Acc, Rest) ->
    {strmatch, unicode:characters_to_list(Acc), Rest};
sm_seq([Next | Later], Acc, Src) ->
    case string_match(Next, Src) of
        no_strmatch             -> no_strmatch;
        {strmatch, Piece, Rest} -> sm_seq(Later, [Acc, Piece], Rest)
    end.


% @private plus matches at least one
%
% The first match is mandatory; its text seeds the accumulator and sm_star/3
% takes care of any further repetitions.
sm_plus(Inner, Src) ->
    case string_match(Inner, Src) of
        no_strmatch             -> no_strmatch;
        {strmatch, First, Rest} -> sm_star(Inner, First, Rest)
    end.


% @private star matches 0 or more
%
% Greedy: keeps applying SMR until it stops matching, collecting the matched
% text in Acc (nested chardata, flattened once at the end). Characters are
% never given back.
%
% An iteration that succeeds without consuming anything (e.g. the inner
% matcher is smr_seq([]) or another smr_star) would otherwise repeat forever
% on the same input, so an empty match ends the repetition.
sm_star(SMR, Acc, Src0) ->
    case string_match(SMR, Src0) of
        % 0
        no_strmatch ->
            {strmatch, unicode:characters_to_list(Acc), Src0};
        % matched the empty string: no progress is possible, stop here
        {strmatch, [], _} ->
            {strmatch, unicode:characters_to_list(Acc), Src0};
        % or more
        {strmatch, Str, Src1} ->
            sm_star(SMR, [Acc, Str], Src1)
    end.