% @doc
% Ref: so_scan.erl
%
% This file contains a sophia tokenizer written in straightforward erlang
% with data types that are sane.
%
% For MVP it mimics the behavior of so_scan exactly, in terms of what its
% definition of a token is and so on.
%
% gsc_so_scan.erl contains a compatibility layer that should agree with
% so_scan exactly. It converts the data types here to the shapes that so_scan
% outputs.
%
% This is for two reasons:
%
% 1. in order to enable testing the two modules against each other, and
% 2. to future-proof in case we decide to incrementally incorporate the gsc
%    code into the legacy sophia compiler
% @end
-module(gs_tokens).

% meta
-export([
    token_shapes_parse_order/0,
    kwds/0
]).
-export([
    take_while/2, take_while/3,
    take_block/1,
    take_block_item/1,
    strings/2,
    slurp_plist/1
]).
% token slurping
-export([
    indent_level/1,
    is_significant/1,
    filter_significant/1,
    significant_tokens/1,
    tokens_from_iolist/1,
    tokens/1,
    slurp_token/2,
    slurp_token_shapes/3,
    slurp_token_of_shape/3,
    new_pos/2
]).

-include("$gsc_include/gsc.hrl").

%=======================================================
% API: functions
%=======================================================

-spec strings(N, Tokens) -> AtMostNStrings
    when N              :: non_neg_integer(),
         Tokens         :: [tk()],
         AtMostNStrings :: [string()].
% @doc
% Returns the `str' field of each of the first N tokens, in order. If fewer
% than N tokens are available, the strings of all of them are returned.

% asked for nothing more: stop regardless of what is left
strings(0, _) ->
    [];
% ran out of tokens before N was exhausted
strings(_, []) ->
    [];
strings(N, [#tk{str = Str} | Tail]) when is_integer(N), N >= 1 ->
    [Str | strings(N - 1, Tail)].


% used by parser
%
% a block is a column-delimited list of block items
%
% BLOCK =
%     foo
%         ...
%     bar
%         ...
%     baz
%         ...
%
% BLOCK_ITEM =
%     foo
%         ...

-spec take_block(Tokens) -> {BlockTokens, Rest}
    when Tokens      :: [tk()],
         BlockTokens :: Tokens,
         Rest        :: Tokens.
% @doc
% Takes the head token plus every immediately following token whose column
% position is >= the column position of the head token. Taking stops at the
% first token that starts in an earlier column; that token and everything
% after it are returned as Rest.
take_block([Head = #tk{pos = {_, BlockCol}} | Tail]) ->
    InBlock = fun(#tk{pos = {_, Col}}) -> Col >= BlockCol end,
    % the head is seeded as the prefix, so the predicate only sees the tail
    take_while(InBlock, [Head], Tail);
take_block([]) ->
    {[], []}.


-spec take_block_item(Tokens) -> {ItemTokens, Rest}
    when Tokens     :: [tk()],
         ItemTokens :: Tokens,
         Rest       :: Tokens.
% @doc
% Takes the head token plus every immediately following token whose column
% position is strictly > the column position of the head token. A following
% token in the same column as the head therefore ends the item.
take_block_item([Head = #tk{pos = {_, ItemCol}} | Tail]) ->
    InItem = fun(#tk{pos = {_, Col}}) -> Col > ItemCol end,
    take_while(InItem, [Head], Tail);
take_block_item([]) ->
    {[], []}.


-spec slurp_plist(Tokens) -> Result
    when Tokens     :: [Token],
         Result     :: {slurp, PList :: Tokens, After :: Tokens}
                     | {error, Mismatch},
         Mismatch   :: {fixme, mismatch, OpenStack, ClosedBy},
         OpenStack  :: Tokens,
         ClosedBy   :: none | {value, Token},
         Token      :: tk().
% @doc
% The verbiage here is `slurp' rather than `take' because we insist on
% delimiter matching.
%
% In the examples below token lists are abbreviated as the source text they
% came from.
%
% typical happy path:
%   "(foo, bar) baz"  ~> {slurp, "(foo, bar)", "baz"}
%   "() baz"          ~> {slurp, "()", "baz"}
%   "foo () baz"      ~> {slurp, "", "foo () baz"}
%
% typical sad path:
%   "(foo, bar]"      ~> {error, {fixme, mismatch, ["("], {value, "]"}}}
%   "(foo, bar"       ~> {error, {fixme, mismatch, ["("], none}}
%   "([foo, bar)"     ~> {error, {fixme, mismatch, ["[", "("], {value, ")"}}}
%
% counterintuitive:
%   "[foo, bar) baz"      ~> {slurp, "", "[foo, bar) baz"}
%   "~!!\inv4l1d syntax"  ~> {slurp, "", "~!!\inv4l1d syntax"}
%   "(foo, bar)(baz)"     ~> {slurp, "(foo, bar)", "(baz)"}
%
% The only "syntax checking" occurring is making sure the delimiter stack
% pushes and pops properly. Input that does not start with "(" is not an
% error: it yields an empty plist and the untouched input.
%
% Please note that on mismatch, the list of open delimiters is returned in
% STACK order, meaning the most recently opened delimiter comes first. This
% is more convenient for programs, but may be counterintuitive when shown
% directly to end-users.
slurp_plist([Hd = #tk{str = "("} | Tl]) ->
    slurp_dlist([Hd], [Hd], Tl);
slurp_plist(Tks) ->
    {slurp, [], Tks}.


% slurp_dlist(All, Opens, Tokens)
%
%   All    - every token consumed so far, most recent first
%   Opens  - stack of currently open delimiter tokens, most recent first
%   Tokens - input still to be consumed

% happy terminal case: stack popped entirely
slurp_dlist(All, [], NewTokens) ->
    {slurp, lists:reverse(All), NewTokens};
% from here on we may assume the stack is nonempty
% happy cases of opens getting popped
slurp_dlist(All, [#tk{str = "("} | NewOpen], [#tk{str = ")"} = Tk | NewTks]) ->
    slurp_dlist([Tk | All], NewOpen, NewTks);
slurp_dlist(All, [#tk{str = "["} | NewOpen], [#tk{str = "]"} = Tk | NewTks]) ->
    slurp_dlist([Tk | All], NewOpen, NewTks);
slurp_dlist(All, [#tk{str = "{"} | NewOpen], [#tk{str = "}"} = Tk | NewTks]) ->
    slurp_dlist([Tk | All], NewOpen, NewTks);
% happy: open delimiters getting pushed
slurp_dlist(All, Opens, [#tk{str = "("} = Tk | NewTks]) ->
    slurp_dlist([Tk | All], [Tk | Opens], NewTks);
slurp_dlist(All, Opens, [#tk{str = "["} = Tk | NewTks]) ->
    slurp_dlist([Tk | All], [Tk | Opens], NewTks);
slurp_dlist(All, Opens, [#tk{str = "{"} = Tk | NewTks]) ->
    slurp_dlist([Tk | All], [Tk | Opens], NewTks);
% sad: mismatch cases
% (the consumed tokens are not part of the error, hence `_All')
% input ran out while delimiters were still open
slurp_dlist(_All, Opens, []) ->
    {error, {fixme, mismatch, Opens, none}};
% a closer that does not pair with the top of the stack; a matching closer
% would already have been handled by the pop clauses above
slurp_dlist(_All, Opens, [#tk{str = "}"} = BadClose | _]) ->
    {error, {fixme, mismatch, Opens, {value, BadClose}}};
slurp_dlist(_All, Opens, [#tk{str = "]"} = BadClose | _]) ->
    {error, {fixme, mismatch, Opens, {value, BadClose}}};
slurp_dlist(_All, Opens, [#tk{str = ")"} = BadClose | _]) ->
    {error, {fixme, mismatch, Opens, {value, BadClose}}};
% general case: non-delimiter token gets pushed onto the consumed list
slurp_dlist(All, Opens, [Tk | NewTks]) ->
    slurp_dlist([Tk | All], Opens, NewTks).


%-------------------------------------------------------
% API: meta info
%
% This is parse order definition, list of keywords, etc
%
% -export([
%   token_shapes_parse_order/0,
%   kwds/0
% ]).
%-------------------------------------------------------

-spec token_shapes_parse_order() -> [tk_shape()].
% @doc
% List of sophia token shapes in parse order (if an earlier shape matches,
% the later shapes are not even checked).
%
% For reference, the corresponding rule table in so_scan:
%
%   Rules =
%       %% Comments and whitespace
%       [ CommentStart
%       , {"//.*", skip()}
%       , {WS,     skip()}
%
%       %% Special characters
%       , {"\\.\\.|[,.;()\\[\\]{}]", symbol()}
%
%       %% Literals
%       , {CHAR,   token(char,   fun parse_char/1)}
%       , {STRING, token(string, fun parse_string/1)}
%       , {HEX,    token(hex,    fun parse_hex/1)}
%       , {INT,    token(int,    fun parse_int/1)}
%       , {BYTES,  token(bytes,  fun parse_bytes/1)}
%
%       %% Identifiers (qualified first!)
%       , {QID,    token(qid,  fun(S) -> string:tokens(S, ".") end)}
%       , {QCON,   token(qcon, fun(S) -> string:tokens(S, ".") end)}
%       , {TVAR,   token(tvar)}
%       , override({ID, token(id)}, {KW, symbol()})  %% Keywords override identifiers. Need to
%       , {CON,    token(con)}                       %% use override to avoid lexing "lettuce"
%                                                    %% as ['let', {id, "tuce"}].
%       %% Operators
%       , {OP,     symbol()}
%       ],
% @end
token_shapes_parse_order() ->
    % One sublist per group keeps the table easy to edit; flattening yields
    % the flat parse order.
    CommentsAndWhitespace = [lcom, bcom, ws],
    Punctuation           = [punct],
    Literals              = [char, string, int16, int10, bytes, ak, ct, sg],
    % qualified names need to go ahead of unqualifieds
    QualifiedAndTvar      = [qid, qcon, tvar],
    % keywords need to be parsed ahead of ids
    Names                 = [kwd, id, con],
    Operators             = [op],
    lists:flatten([CommentsAndWhitespace,
                   Punctuation,
                   Literals,
                   QualifiedAndTvar,
                   Names,
                   Operators]).


-spec kwds() -> list(string()).
% @doc list of sophia keywords
kwds() ->
    [
        "contract", "include", "let", "switch", "type",
        "record", "datatype", "if", "elif", "else",
        "function", "stateful", "payable", "true", "false",
        "mod", "public", "entrypoint", "private", "indexed",
        "namespace", "interface", "main", "using", "as",
        "for", "hiding", "band", "bor", "bxor",
        "bnot"
    ].


%-------------------------------------------------------
% API: token slurping
%
% -export([
%   tokens/1,
%   slurp_token/2,
%   slurp_token_shapes/3,
%   slurp_token_of_shape/3
% ]).
%-------------------------------------------------------

% Token accessors

-spec indent_level(tk()) -> pos_integer().
% @doc the column component of the token's `pos' field
indent_level(#tk{pos = {_Line, Column}}) ->
    Column.


-spec significant_tokens(SrcStr) -> Result
    when SrcStr :: iolist(),
         Result :: {ok, Tokens}
                 | {error, gsc_err()},
         Tokens :: [tk()].
% @doc
% Tokenizes with tokens/1 and, on success, keeps only the tokens for which
% is_significant/1 holds. Any other result of tokens/1 is returned unchanged.
significant_tokens(SrcStr) ->
    case tokens(SrcStr) of
        {ok, AllTokens} -> {ok, filter_significant(AllTokens)};
        NotOk           -> NotOk
    end.


-spec filter_significant(Tokens) -> SignificantTokens
    when Tokens            :: [tk()],
         SignificantTokens :: Tokens.
% @doc drops every token for which is_significant/1 is false
filter_significant(Tokens) ->
    lists:filter(fun(Token) -> is_significant(Token) end, Tokens).


-spec is_significant(Token) -> boolean()
    when Token :: tk().
% @doc
% false for block comments (bcom), line comments (lcom) and whitespace (ws);
% true for everything else
is_significant(#tk{shape = Shape})
        when Shape =:= bcom; Shape =:= lcom; Shape =:= ws ->
    false;
is_significant(_) ->
    true.


-spec tokens_from_iolist(SrcStr) -> Result
    when SrcStr :: iolist(),
         Result :: {ok, Tokens}
                 | {error, gsc_err()},
         Tokens :: [tk()].
% @doc alias for tokens/1
tokens_from_iolist(SrcStr) ->
    tokens(SrcStr).


-spec tokens(SrcStr) -> Result
    when SrcStr :: iolist(),
         Result :: {ok, Tokens}
                 | {error, gsc_err()},
         Tokens :: [tk()].
% @doc
% Recursively parse all tokens off the front end of the string.
%
% The input is first normalized to an NFC codepoint list and then handed to
% tokens/3 with an empty token stack and the start position {1, 1}.
tokens(SrcStr) ->
    % defensive normalization
    % NOTE(review): unicode:characters_to_nfc_list/1 returns an error tuple
    % on invalid input, which is passed on as-is -- confirm this is intended
    Normalized = unicode:characters_to_nfc_list(SrcStr),
    StartPos = {1, 1},
    tokens([], StartPos, Normalized).
% tokens(Stack, Pos, SrcStr)
%
%   Stack  - tokens consumed so far, most recent first
%   Pos    - {Line, Column} at which SrcStr starts
%   SrcStr - input still to be tokenized

% input exhausted: hand back the tokens in source order
tokens(Stack, _FinalPos, "") ->
    {ok, lists:reverse(Stack)};
tokens(Stack, Pos, SrcStr) ->
    case slurp_token(Pos, SrcStr) of
        {tokmatch, Token = #tk{str = TokenStr}, Remaining} ->
            % the next token starts where this token's text ends
            NextPos = new_pos(Pos, TokenStr),
            tokens([Token | Stack], NextPos, Remaining);
        no_tokmatch ->
            % no token shape matches at the front of SrcStr
            {error, #gsc_err_no_tokmatch{prev_tokens = lists:reverse(Stack),
                                         break_pos   = Pos,
                                         rest        = SrcStr}};
        % FIXME so_scan bad
        % so_scan for some reason allows unterminated block comments at
        % the end of files
        %
        % for now we're just going to agree with so_scan
        %
        % NOTE(review): this branch returns an error -- confirm that this
        % really is what "agreeing with so_scan" means here
        {ierr, unterminated_block_comment} ->
            {error, #gsc_err_bcom_unterminated{prev_tokens = lists:reverse(Stack),
                                               break_pos   = Pos,
                                               rest        = SrcStr}};
        {error, _} = Error ->
            Error
    end.


% Position bookkeeping
%
% We are computing the line/column position of each token string.
%
% However this is meant to be compatible with so_scan, so it's a bit wonky
% because of how so_scan's regex machinery works on lists.
%
% Recall that so_scan operates on the list representation of the utf-8
% encoded bytes; this is different than a list of codepoints (e.g.
% unicode:characters_to_nfc_list(Bytes)). Let's suppose some multi-byte
% character has list representation [ABC], but byte representation
% <<A, B, C>>.
%
% As far as so_scan is concerned, this means the character ABC consumes 3
% columns. The only exception is tab characters, which always fast-forward
% to the next tab stop. Columns are 1-indexed,
%
% so the tab-stops are
%
%   1 9 17 25 33 ...
%
% Column position is determined in all cases by byte count, EXCEPT for $\t,
% which goes to the next tab stop.
%
% So in general, for the token string, we need to convert to bytes first,
% then handle `\t` bytes as a special case.
%
% Again, in the tokenizer context we're assuming that the input to our
% tokenizer is an nfc-list: a flat list with each unicode character in
% codepoint form.
%
% Here we're just converting it to byte form, then computing columns based
% on bytes.
new_pos(OldPos, TokStr) ->
    Utf8Bytes = unicode:characters_to_binary(TokStr),
    new_pos_bytes(Utf8Bytes, OldPos).

% all bytes consumed: the position reached is the answer
new_pos_bytes(<<>>, Pos) ->
    Pos;
% newline just goes to {L+1, 1}
new_pos_bytes(<<$\n, Tail/bytes>>, {Line, _Col}) ->
    new_pos_bytes(Tail, {Line + 1, 1});
% tab jumps to the next tab stop
%   1-based columns: tab stops are at 1 9 17 25
%   0-based columns: tab stops are at 0 8 16 24
% the arithmetic is done 0-based and shifted back to 1-based
new_pos_bytes(<<$\t, Tail/bytes>>, {Line, Col1}) ->
    TabstopCol1 = next_tabstop8(Col1 - 1) + 1,
    new_pos_bytes(Tail, {Line, TabstopCol1});
% in general advance by 1 column per byte
new_pos_bytes(<<_, Tail/bytes>>, {Line, Col1}) ->
    new_pos_bytes(Tail, {Line, Col1 + 1}).

% 0-based tab stops are the multiples of 8:
%
%   0    8    16   24   etc
%   0*8  1*8  2*8  3*8  etc
%
% returns the first tab stop strictly after the 0-based column Col0
next_tabstop8(Col0) when Col0 >= 0 ->
    (Col0 div 8 + 1) * 8.

%% copied from so_scan_lib.erl just to match behavior
%-define(TAB_SIZE, 8).
%
%next_pos([], P) -> P;
%next_pos([$\n | S], {L, _}) -> next_pos(S, {L + 1, 1});
%next_pos([$\t | S], {L, C}) -> next_pos(S, {L, (C + ?TAB_SIZE - 1) div ?TAB_SIZE * ?TAB_SIZE + 1});
%next_pos([_   | S], {L, C}) -> next_pos(S, {L, C + 1}).
-spec slurp_token(Pos, SrcStr) -> Result
    when Pos    :: tk_pos(),
         SrcStr :: string(),
         Result :: {tokmatch, Token, Rest}
                 | no_tokmatch
                 | {error, gsc_err()}
                 | {ierr, unterminated_block_comment},
         Token  :: tk(),
         Rest   :: string().
% @doc
% Grab a single token off the front of the string, trying the shapes listed
% by `token_shapes_parse_order/0' in that order.
slurp_token(Pos, SrcStr) ->
    % kept as a thin wrapper so the parse order is easy to swap out
    ParseOrder = token_shapes_parse_order(),
    slurp_token_shapes(ParseOrder, Pos, SrcStr).


-spec slurp_token_shapes(ParseOrder, Pos, SrcStr) -> Result
    when ParseOrder :: [tk_shape()],
         Pos        :: tk_pos(),
         SrcStr     :: string(),
         Result     :: {tokmatch, Token, Rest}
                     | no_tokmatch
                     | {error, gsc_err()}
                     | {ierr, unterminated_block_comment},
         Token      :: tk(),
         Rest       :: string().
% @doc
% Grab a single token off the front of the string, trying each shape in
% `ParseOrder' from left to right.
%
% The first shape that matches wins. A shape that reports `no_tokmatch'
% moves on to the next shape; an `{ierr, _}' or `{error, _}' result stops
% the search and is returned as-is. If the shapes run out, the result is
% `no_tokmatch'.

% every shape was tried and none matched
slurp_token_shapes([], _Pos, _SrcStr) ->
    no_tokmatch;
slurp_token_shapes([Shape | LaterShapes], Pos, SrcStr) ->
    case slurp_token_of_shape(Shape, Pos, SrcStr) of
        no_tokmatch              -> slurp_token_shapes(LaterShapes, Pos, SrcStr);
        {tokmatch, _, _} = Match -> Match;
        {ierr, _} = IErr         -> IErr;
        {error, _} = Error       -> Error
    end.


-spec slurp_token_of_shape(TokenType, Pos, SrcStr) -> MaybeToken
    when TokenType  :: tk_shape(),
         Pos        :: tk_pos(),
         SrcStr     :: string(),
         MaybeToken :: {tokmatch, Token, Rest}
                     | no_tokmatch
                     | {error, gsc_err()}
                     | {ierr, unterminated_block_comment},
         Token      :: tk(),
         Rest       :: string().
% @doc
% match a sophia token of a given shape off the front of the string
% @end

% COMMENTS AND WHITESPACE: lcom, bcom, ws
%
% sophia line comment: from "//" up to, but not including, the next newline
% (or up to the end of the input)
%
% FIXME: make a string matcher for line comments
slurp_token_of_shape(lcom, Pos, "//" ++ _ = SrcStr) ->
    {Line, Rest} = takeline("", SrcStr),
    {tokmatch, #tk{shape = lcom, pos = Pos, str = Line}, Rest};
slurp_token_of_shape(lcom, _Pos, _SrcStr) ->
    no_tokmatch;
% Block comments cannot have a string matcher because block comments nest,
% so the scan has to keep track of depth.
slurp_token_of_shape(bcom, Pos, "/*" ++ AfterOpen) ->
    % the opening "/*" is already consumed, hence depth 1
    case bcom("/*", 1, AfterOpen) of
        {ok, CommentStr, Rest} ->
            {tokmatch, #tk{shape = bcom, pos = Pos, str = CommentStr}, Rest};
        Error ->
            Error
    end;
slurp_token_of_shape(bcom, _Pos, _SrcStr) ->
    no_tokmatch;
slurp_token_of_shape(ws, Pos, SrcStr) ->
    slurp_strmatch(ws, gs_strmatch:smr_sf_ws(), Pos, SrcStr);
% KEYWORDS, OPERATORS, PUNCTUATION: kwd, op, punct
%
% all the kwds are valid ids, so we match as an id and then check if it's a
% kwd
%
% kwds are allowed to be prefixes for user-defined variable names; e.g.
% "lettuce" should be parsed as an id, not as ["let", "tuce"]; for this
% reason we need to be careful with greedily parsing kwds
%
% we know kwds are always ids, so we parse it as an id and see if it's one
% of the kwds
slurp_token_of_shape(kwd, Pos, SrcStr) ->
    case slurp_token_of_shape(id, Pos, SrcStr) of
        no_tokmatch ->
            no_tokmatch;
        {tokmatch, IdTok = #tk{str = IdStr}, Rest} ->
            case lists:member(IdStr, kwds()) of
                true  -> {tokmatch, IdTok#tk{shape = kwd}, Rest};
                false -> no_tokmatch
            end
    end;
slurp_token_of_shape(op, Pos, SrcStr) ->
    slurp_strmatch(op, gs_strmatch:smr_sf_op(), Pos, SrcStr);
slurp_token_of_shape(punct, Pos, SrcStr) ->
    slurp_strmatch(punct, gs_strmatch:smr_sf_punct(), Pos, SrcStr);
% SOPHIA VARIABLE NAMES: id, con, qid, qcon, tvar
slurp_token_of_shape(id, Pos, SrcStr) ->
    slurp_strmatch(id, gs_strmatch:smr_sf_id(), Pos, SrcStr);
slurp_token_of_shape(con, Pos, SrcStr) ->
    slurp_strmatch(con, gs_strmatch:smr_sf_con(), Pos, SrcStr);
slurp_token_of_shape(qid, Pos, SrcStr) ->
    slurp_strmatch(qid, gs_strmatch:smr_sf_qid(), Pos, SrcStr);
slurp_token_of_shape(qcon, Pos, SrcStr) ->
    slurp_strmatch(qcon, gs_strmatch:smr_sf_qcon(), Pos, SrcStr);
slurp_token_of_shape(tvar, Pos, SrcStr) ->
    slurp_strmatch(tvar, gs_strmatch:smr_sf_tvar(), Pos, SrcStr);
% LITERALS: int16, int10, ak, ct, sg, char, string, bytes
slurp_token_of_shape(int16, Pos, SrcStr) ->
    slurp_strmatch(int16, gs_strmatch:smr_sf_int16(), Pos, SrcStr);
slurp_token_of_shape(int10, Pos, SrcStr) ->
    slurp_strmatch(int10, gs_strmatch:smr_sf_int10(), Pos, SrcStr);
slurp_token_of_shape(ak, Pos, SrcStr) ->
    slurp_strmatch(ak, gs_strmatch:smr_sf_ak(), Pos, SrcStr);
slurp_token_of_shape(ct, Pos, SrcStr) ->
    slurp_strmatch(ct, gs_strmatch:smr_sf_ct(), Pos, SrcStr);
slurp_token_of_shape(sg, Pos, SrcStr) ->
    slurp_strmatch(sg, gs_strmatch:smr_sf_sg(), Pos, SrcStr);
slurp_token_of_shape(char, Pos, SrcStr) ->
    slurp_strmatch(char, gs_strmatch:smr_sf_char(), Pos, SrcStr);
slurp_token_of_shape(string, Pos, SrcStr) ->
    slurp_strmatch(string, gs_strmatch:smr_sf_str(), Pos, SrcStr);
slurp_token_of_shape(bytes, Pos, SrcStr) ->
    slurp_strmatch(bytes, gs_strmatch:smr_sf_bytes(), Pos, SrcStr);
% any shape without a clause above is not yet implemented: raise
slurp_token_of_shape(NyiType, Pos, SrcStr) ->
    Message = io_lib:format("cannot slurp token of shape: ~p", [NyiType]),
    error(#gsc_err{atom  = nyi,
                   str   = Message,
                   extra = [{token_shape, NyiType},
                            {pos, Pos},
                            {rest, SrcStr}]}).

% Shared body of every shape that is recognized by a gs_strmatch matcher:
% run Matcher against the front of SrcStr and, on a match, wrap the matched
% text in a token of shape Shape positioned at Pos.
slurp_strmatch(Shape, Matcher, Pos, SrcStr) ->
    case gs_strmatch:match(Matcher, SrcStr) of
        {strmatch, Str, Rest} ->
            {tokmatch, #tk{shape = Shape, pos = Pos, str = Str}, Rest};
        no_strmatch ->
            no_tokmatch
    end.


% takeline(RevLine, SrcStr) -> {Line, Rest}
%
% Moves characters from the front of SrcStr onto RevLine (kept in reverse)
% until the input ends or the next character is a newline. The newline
% itself is left at the front of Rest.
takeline(RevLine, "") ->
    {lists:reverse(RevLine), ""};
takeline(RevLine, [$\n | _] = Rest) ->
    {lists:reverse(RevLine), Rest};
takeline(RevLine, [Char | Tail]) ->
    takeline([Char | RevLine], Tail).


% bcom(CommentStr, Depth, SrcStr)
%
%   CommentStr - comment text consumed so far, as nested chardata
%   Depth      - number of block comments currently open
%   SrcStr     - input still to be scanned
%
% Returns {ok, CommentText, Rest} once Depth drops to 0, or
% {ierr, unterminated_block_comment} if the input ends first.

% every opened comment has been closed: done
bcom(CommentStr, 0, SrcStr) ->
    {ok, unicode:characters_to_nfc_list(CommentStr), SrcStr};
% premature end
bcom(_CommentStr, Depth, "") when Depth > 0 ->
    {ierr, unterminated_block_comment};
% decrease depth
bcom(CommentStr, Depth, "*/" ++ Tail) when Depth > 0 ->
    bcom([CommentStr, "*/"], Depth - 1, Tail);
% increase depth
bcom(CommentStr, Depth, "/*" ++ Tail) when Depth > 0 ->
    bcom([CommentStr, "/*"], Depth + 1, Tail);
% same depth, add the character to the comment text
bcom(CommentStr, Depth, [Char | Tail]) when Depth > 0 ->
    bcom([CommentStr, Char], Depth, Tail).


%------------------------------------------
% INTERNAL UTILITIES
%------------------------------------------

-spec take_while(Pred, List) -> {Taken, Rest}
    when Pred  :: fun((Item) -> boolean()),
         List  :: [Item],
         Taken :: List,
         Rest  :: List.
% @doc
% Similar to lists:takewhile but returns {Taken, Rest}. The name is to
% remind you that it returns 2 things.
take_while(Pred, List) ->
    % nothing has been taken up front, so the prefix is empty
    take_while(Pred, [], List).


-spec take_while(Pred, Prefix, List) -> {Taken, Rest}
    when Pred   :: fun((Item) -> boolean()),
         Prefix :: List,
         List   :: [Item],
         Taken  :: List,
         Rest   :: List.
% @doc
% Similar to take_while/2, but returns {Prefix ++ Taken, Rest}.
%
% Prefix is a list of items that count as already taken: the predicate is
% never applied to them and they appear, in their given order, in front of
% the items taken from List.
%
% The middle argument is just the initial value of the accumulator.
take_while(Pred, Pfx, List) ->
    % the accumulator is kept in reverse, so the prefix is reversed going in
    RevPrefix = lists:reverse(Pfx),
    tw3(Pred, RevPrefix, List).

% tw3(Pred, RevTaken, Remaining)
%
%   RevTaken  - items taken so far, most recent first
%   Remaining - items not yet examined

% ran out of input: everything was taken
tw3(_Pred, RevTaken, []) ->
    {lists:reverse(RevTaken), []};
tw3(Pred, RevTaken, [Item | Tail] = Remaining) ->
    case Pred(Item) of
        % first failing item stays at the front of the rest
        false -> {lists:reverse(RevTaken), Remaining};
        true  -> tw3(Pred, [Item | RevTaken], Tail)
    end.