% \iffalse meta-comment
%
%% File: l3str.dtx Copyright (C) 2014 The LaTeX3 Project
%%
%% It may be distributed and/or modified under the conditions of the
%% LaTeX Project Public License (LPPL), either version 1.3c of this
%% license or (at your option) any later version.  The latest version
%% of this license is in the file
%%
%%    http://www.latex-project.org/lppl.txt
%%
%% This file is part of the "l3kernel bundle" (The Work in LPPL)
%% and all files in that bundle must be distributed together.
%%
%% The released version of this bundle is available from CTAN.
%%
%% -----------------------------------------------------------------------
%%
%% The development version of the bundle can be found at
%%
%%    http://www.latex-project.org/svnroot/experimental/trunk/
%%
%% for those people who are interested.
%%
%%%%%%%%%%%
%% NOTE: %%
%%%%%%%%%%%
%%
%%   Snapshots taken from the repository represent work in progress and may
%%   not work or may contain conflicting material!  We therefore ask
%%   people _not_ to put them into distributions, archives, etc. without
%%   prior consultation with the LaTeX3 Project.
%%
%% -----------------------------------------------------------------------
%
%<*driver>
\documentclass[full]{l3doc}
%</driver>
%<*driver|package>
\GetIdInfo$Id: l3str.dtx 5545 2015-03-01 21:24:44Z joseph $
  {L3 Strings}
%</driver|package>
%<*driver>
\begin{document}
  \DocInput{\jobname.dtx}
\end{document}
%</driver>
% \fi
%
% \title{^^A
%   The \pkg{l3str} package\\Strings^^A
%   \thanks{This file describes v\ExplFileVersion,
%      last revised \ExplFileDate.}^^A
% }
%
% \author{^^A
%  The \LaTeX3 Project\thanks
%    {^^A
%      E-mail:
%        \href{mailto:latex-team@latex-project.org}
%          {latex-team@latex-project.org}^^A
%    }^^A
% }
%
% \date{Released \ExplFileDate}
%
% \maketitle
%
% \begin{documentation}
%
% \TeX{} associates each character with a category code: as such, there is no
% concept of a \enquote{string} as commonly understood in many other
% programming languages.
However, there are places where we wish to manipulate % token lists while in some sense \enquote{ignoring} category codes: this is % done by treating token lists as strings in a \TeX{} sense. % % A \TeX{} string (and thus an \pkg{expl3} string) is a series of characters % which have category code $12$ (\enquote{other}) with the exception of % space characters which have category code $10$ (\enquote{space}). Thus % at a technical level, a \TeX{} string is a token list with the appropriate % category codes. In this documentation, these will simply be referred to as % strings: note that they can be stored in token lists as normal. % % The functions documented here take literal token lists, % convert to strings and then carry out manipulations. Thus they may % informally be described as \enquote{ignoring} category code. Note that % the functions \cs{cs_to_str:N}, \cs{tl_to_str:n}, \cs{tl_to_str:N} and % \cs{token_to_str:N} (and variants) will generate strings from the appropriate % input: these are documented in \pkg{l3basics}, \pkg{l3tl} and \pkg{l3token}, % respectively. % % \section{The first character from a string} % % \begin{function}[added = 2011-08-10, EXP]{\str_head:n,\str_tail:n} % \begin{syntax} % \cs{str_head:n} \Arg{token list} % \cs{str_tail:n} \Arg{token list} % \end{syntax} % Converts the \meta{token list} into a string, as described for % \cs{tl_to_str:n}. The \cs{str_head:n} function then leaves % the first character of this string in the input stream. % The \cs{str_tail:n} function leaves all characters except % the first in the input stream. The first character may be % a space. If the \meta{token list} argument is entirely empty, % nothing is left in the input stream. 
% \end{function}
%
% \subsection{Tests on strings}
%
% \begin{function}[EXP,pTF]
%   {
%     \str_if_eq:nn, \str_if_eq:Vn, \str_if_eq:on, \str_if_eq:no,
%     \str_if_eq:nV, \str_if_eq:VV
%   }
%   \begin{syntax}
%     \cs{str_if_eq_p:nn} \Arg{tl_1} \Arg{tl_2}
%     \cs{str_if_eq:nnTF} \Arg{tl_1} \Arg{tl_2} \Arg{true code} \Arg{false code}
%   \end{syntax}
%   Compares the two \meta{token lists} on a character by character
%   basis, and is \texttt{true} if the two lists contain the same
%   characters in the same order. Thus for example
%   \begin{verbatim}
%     \str_if_eq_p:no { abc } { \tl_to_str:n { abc } }
%   \end{verbatim}
%   is logically \texttt{true}.
% \end{function}
%
% \begin{function}[EXP,pTF, added = 2012-06-05]{\str_if_eq_x:nn}
%   \begin{syntax}
%     \cs{str_if_eq_x_p:nn} \Arg{tl_1} \Arg{tl_2}
%     \cs{str_if_eq_x:nnTF} \Arg{tl_1} \Arg{tl_2} \Arg{true code} \Arg{false code}
%   \end{syntax}
%   Compares the full expansion of two \meta{token lists} on a character by
%   character basis, and is \texttt{true} if the two lists contain the same
%   characters in the same order. Thus for example
%   \begin{verbatim}
%     \str_if_eq_x_p:nn { abc } { \tl_to_str:n { abc } }
%   \end{verbatim}
%   is logically \texttt{true}.
% \end{function}
%
% \begin{function}[added = 2013-07-24, updated = 2015-02-28, EXP, TF]
%   {\str_case:nn, \str_case:on, \str_case:nV, \str_case:nv}
%   \begin{syntax}
%     \cs{str_case:nnTF} \Arg{test string} \\
%     ~~|{| \\
%     ~~~~\Arg{string case_1} \Arg{code case_1} \\
%     ~~~~\Arg{string case_2} \Arg{code case_2} \\
%     ~~~~\ldots \\
%     ~~~~\Arg{string case_n} \Arg{code case_n} \\
%     ~~|}| \\
%     ~~\Arg{true code}
%     ~~\Arg{false code}
%   \end{syntax}
%   This function compares the \meta{test string} in turn with each
%   of the \meta{string cases}. If the two are equal (as described for
%   \cs{str_if_eq:nnTF}) then the
%   associated \meta{code} is left in the input stream.
%   If any of the
%   cases are matched, the \meta{true code} is also inserted into the
%   input stream (after the code for the appropriate case), while if none
%   match then the \meta{false code} is inserted. The function
%   \cs{str_case:nn}, which does nothing if there is no match, is also
%   available.
% \end{function}
%
% \begin{function}[added = 2013-07-24, EXP, TF]{\str_case_x:nn}
%   \begin{syntax}
%     \cs{str_case_x:nnTF} \Arg{test string} \\
%     ~~|{| \\
%     ~~~~\Arg{string case_1} \Arg{code case_1} \\
%     ~~~~\Arg{string case_2} \Arg{code case_2} \\
%     ~~~~\ldots \\
%     ~~~~\Arg{string case_n} \Arg{code case_n} \\
%     ~~|}| \\
%     ~~\Arg{true code}
%     ~~\Arg{false code}
%   \end{syntax}
%   This function compares the full expansion of the \meta{test string}
%   in turn with the full expansion of the \meta{string cases}. If the two
%   full expansions are equal (as described for \cs{str_if_eq:nnTF}) then the
%   associated \meta{code} is left in the input stream. If any of the
%   cases are matched, the \meta{true code} is also inserted into the
%   input stream (after the code for the appropriate case), while if none
%   match then the \meta{false code} is inserted. The function
%   \cs{str_case_x:nn}, which does nothing if there is no match, is also
%   available.
%   The \meta{test string} is expanded in each comparison, and must
%   always yield the same result: for example, random numbers must
%   not be used within this string.
% \end{function}
%
% \section{String manipulation}
%
% \begin{function}[rEXP, added = 2015-03-01]
%   {
%     \str_lower_case:n, \str_lower_case:f,
%     \str_upper_case:n, \str_upper_case:f
%   }
%   \begin{syntax}
%     \cs{str_lower_case:n} \Arg{tokens}
%     \cs{str_upper_case:n} \Arg{tokens}
%   \end{syntax}
%   Converts the input \meta{tokens} to their string representation, as
%   described for \cs{tl_to_str:n}, and then to the lower or upper
%   case representation using a one-to-one mapping as described by the
%   Unicode Consortium file |UnicodeData.txt|.
%
%   These functions are intended for case changing programmatic data in
%   places where upper/lower case distinctions are meaningful. One example
%   would be automatically generating a function name from user input where
%   some case changing is needed. In this situation the input is programmatic,
%   not textual, case does have meaning and a language-independent one-to-one
%   mapping is appropriate. For example
%   \begin{verbatim}
%     \cs_new_protected:Npn \myfunc:nn #1#2
%       {
%         \cs_set_protected:cpn
%           {
%             user
%             \str_upper_case:f { \tl_head:n {#1} }
%             \str_lower_case:f { \tl_tail:n {#1} }
%           }
%           { #2 }
%       }
%   \end{verbatim}
%   would be used to generate a function with an auto-generated name consisting
%   of the upper case equivalent of the first character of the supplied name
%   followed by the lower case equivalent of the rest of the input.
%
%   These functions should \emph{not} be used for
%   \begin{itemize}
%     \item Caseless comparisons: use \cs{str_fold_case:n} for this
%       situation (case folding is distinct from lower casing).
%     \item Case changing text for typesetting: see the \cs{tl_lower_case:n(n)},
%       \cs{tl_upper_case:n(n)} and \cs{tl_mixed_case:n(n)} functions which
%       correctly deal with context-dependence and other factors appropriate
%       to text case changing.
%   \end{itemize}
%
%   \begin{texnote}
%     As with all \pkg{expl3} functions, the input supported by
%     \cs{str_lower_case:n} and \cs{str_upper_case:n} is \emph{engine-native}
%     characters which are or interoperate with \textsc{utf-8}. As such, when
%     used with \pdfTeX{} \emph{only} the Latin alphabet characters A--Z and
%     a--z will have their case changed
%     (\emph{i.e.}~the \textsc{ascii} range which coincides with
%     \textsc{utf-8}). Full \textsc{utf-8} support is available with both
%     \XeTeX{} and \LuaTeX{}, subject only to the fact that \XeTeX{} in
%     particular has issues with characters of code above hexadecimal
%     $0\mathrm{xFFF}$ when interacting with \cs{tl_to_str:n}.
% \end{texnote} % \end{function} % % \begin{function}[rEXP, added = 2014-06-19, updated = 2015-03-01] % {\str_fold_case:n} % \begin{syntax} % \cs{str_fold_case:n} \Arg{tokens} % \end{syntax} % Converts the input \meta{tokens} to their string representation, as % described for \cs{tl_to_str:n}, and then folds the case of the resulting % \meta{string} to remove case information. The result of this process is % left in the input stream. % % String folding is a process used for material such as identifiers rather % than for \enquote{text}. The folding provided by \cs{str_fold_case:n} % follows the mappings provided by the \href{http://www.unicode.org}^^A % {Unicode Consortium}, who % \href{http://www.unicode.org/faq/casemap_charprop.html#2}{state}: % \begin{quote} % Case folding is primarily used for caseless comparison of text, such % as identifiers in a computer program, rather than actual text % transformation. Case folding in Unicode is based on the lowercase % mapping, but includes additional changes to the source text to help make % it language-insensitive and consistent. As a result, case-folded text % should be used solely for internal processing and generally should not be % stored or displayed to the end user. % \end{quote} % The folding approach implemented by \cs{str_fold_case:n} follows the % \enquote{full} scheme defined by the Unicode Consortium % (\emph{e.g.}~\SS folds to \texttt{SS}). As case-folding is % a language-insensitive process, there is no special treatment of % Turkic input (\emph{i.e.}~\texttt{I} always folds to \texttt{i} and % not to \texttt{\i}). % % \begin{texnote} % As with all \pkg{expl3} functions, the input supported by % \cs{str_fold_case:n} is \emph{engine-native} characters which are or % interoperate with \textsc{utf-8}. As such, when used with \pdfTeX{} % \emph{only} the Latin alphabet characters A--Z will be case-folded % (\emph{i.e.}~the \textsc{ascii} range which coincides with % \textsc{utf-8}). 
Full \textsc{utf-8} support is available with both % \XeTeX{} and \LuaTeX{}, subject only to the fact that \XeTeX{} in % particular has issues with characters of code above hexadecimal % $0\mathrm{xFFF}$ when interacting with \cs{tl_to_str:n}. % \end{texnote} % \end{function} % % \subsection{Internal string functions} % % \begin{function}[EXP]{\__str_if_eq_x:nn} % \begin{syntax} % \cs{__str_if_eq_x:nn} \Arg{tl_1} \Arg{tl_2} % \end{syntax} % Compares the full expansion of two \meta{token lists} on a character by % character basis, and is \texttt{true} if the two lists contain the same % characters in the same order. Leaves |0| in the input stream if the % condition is true, and |+1| or |-1| otherwise. % \end{function} % % \begin{function}{\__str_if_eq_x_return:nn} % \begin{syntax} % \cs{__str_if_eq_x_return:nn} \Arg{tl_1} \Arg{tl_2} % \end{syntax} % Compares the full expansion of two \meta{token lists} on a character by % character basis, and is \texttt{true} if the two lists contain the same % characters in the same order. Either \cs{prg_return_true:} or % \cs{prg_return_false:} is then left in the input stream. This is a version % of \cs{str_if_eq_x:nn(TF)} coded for speed. % \end{function} % % \end{documentation} % % \begin{implementation} % % \section{\pkg{l3str} implementation} % % \begin{macrocode} %<*initex|package> % \end{macrocode} % % \begin{macrocode} %<@@=str> % \end{macrocode} % % \begin{macro}{\str_head:n, \str_tail:n} % \begin{macro}[aux]{\__str_head:w} % \begin{macro}[aux]{\__str_tail:w} % After \cs{tl_to_str:n}, we have a list of character tokens, % all with category code 12, except the space, which has category % code 10. Directly using \cs{tl_head:w} would thus lose leading spaces. % Instead, we take an argument delimited by an explicit space, and % then only use \cs{tl_head:w}. If the string started with a % space, then the argument of \cs{__str_head:w} is empty, and % the function correctly returns a space character. 
%   Otherwise,
%   it returns the first token of |#1|, which is the first token
%   of the string. If the string is empty, we return an empty result.
%   The trailing |{ { } } ~ \q_stop| inserted by \cs{str_head:n} guarantees
%   that \cs{__str_head:w} always finds a delimiting space, even when the
%   string itself contains none; the \cs{q_stop} is presumably the end
%   marker expected by \cs{tl_head:w} (defined in \pkg{l3tl}).
%
%   To remove the first character of \cs{tl_to_str:n} |{#1}|,
%   we test it using \cs{if_charcode:w} \cs{scan_stop:},
%   always \texttt{false} for characters. If the argument was non-empty,
%   then \cs{__str_tail:w} returns everything until the first
%   \texttt{X} (with category code letter, no risk of confusing
%   with the user input). If the argument was empty, the first
%   \texttt{X} is taken by \cs{if_charcode:w}, and nothing
%   is returned. We use \texttt{X} as a \meta{marker}, rather than
%   a quark because the test \cs{if_charcode:w} \cs{scan_stop:}
%   \meta{marker} has to be \texttt{false}.
%    \begin{macrocode}
\cs_new:Npn \str_head:n #1
  {
    \exp_after:wN \__str_head:w
    \tl_to_str:n {#1}
    { { } } ~ \q_stop
  }
\cs_new:Npn \__str_head:w #1 ~ %
  { \tl_head:w #1 { ~ } }
\cs_new:Npn \str_tail:n #1
  {
    \exp_after:wN \__str_tail:w
    \reverse_if:N \if_charcode:w
        \scan_stop: \tl_to_str:n {#1} X X \q_stop
  }
\cs_new:Npn \__str_tail:w #1 X #2 \q_stop { \fi: #1 }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsection{String comparisons}
%
% \begin{macro}[int, EXP]{\__str_if_eq_x:nn}
% \begin{macro}[aux, EXP]{\__str_escape_x:n}
%   String comparisons rely on the primitive \cs{(pdf)strcmp} if available:
%   \LuaTeX{} does not have it, so emulation is required. As the net result
%   is that we do not \emph{always} use the primitive, the correct approach
%   is to wrap up in a function with defined behaviour. That's done by
%   providing a wrapper and then redefining in the \LuaTeX{} case. Note that
%   the necessary Lua code is covered in \pkg{l3bootstrap}: long-term this may
%   need to go into a separate Lua file, but at present it's somewhere that
%   spaces are not skipped for ease-of-input.
%   The need to detokenize and force
%   expansion of input arises from the case where a |#| token is used in the
%   input, \emph{e.g.}~|\__str_if_eq_x:nn {#} { \tl_to_str:n {#} }|, which
%   otherwise will fail as \cs{luatex_luaescapestring:D} does not double
%   such tokens.
%
%   The initial definition simply calls \cs{pdftex_strcmp:D}. Under
%   \LuaTeX{} it is replaced by a call to the Lua function
%   |l3kernel.strcmp| through \cs{luatex_directlua:D}, with each argument
%   expanded, detokenized and escaped by \cs{__str_escape_x:n} before being
%   placed between the Lua string quotes.
%    \begin{macrocode}
\cs_new:Npn \__str_if_eq_x:nn #1#2 { \pdftex_strcmp:D {#1} {#2} }
\luatex_if_engine:T
  {
    \cs_set:Npn \__str_if_eq_x:nn #1#2
      {
        \luatex_directlua:D
          {
            l3kernel.strcmp
              (
                " \__str_escape_x:n {#1} " ,
                " \__str_escape_x:n {#2} "
              )
          }
      }
    \cs_new:Npn \__str_escape_x:n #1
      {
        \luatex_luaescapestring:D
          {
            \etex_detokenize:D \exp_after:wN { \luatex_expanded:D {#1} }
          }
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}[int, EXP]{\__str_if_eq_x_return:nn}
%   It turns out that we often need to compare a token list
%   with the result of applying some function to it, and
%   return with \cs{prg_return_true/false:}. This test is
%   similar to \cs{str_if_eq_x:nnTF} (defined below in this file),
%   but is hard-coded for speed: it compares the number left by
%   \cs{__str_if_eq_x:nn} with zero.
%    \begin{macrocode}
\cs_new:Npn \__str_if_eq_x_return:nn #1 #2
  {
    \if_int_compare:w \__str_if_eq_x:nn {#1} {#2} = \c_zero
      \prg_return_true:
    \else:
      \prg_return_false:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[pTF, EXP]
%   {
%     \str_if_eq:nn, \str_if_eq:Vn, \str_if_eq:on, \str_if_eq:nV,
%     \str_if_eq:no, \str_if_eq:VV,
%     \str_if_eq_x:nn
%   }
%   Modern engines provide a direct way of comparing two token lists,
%   but returning a number. This set of conditionals therefore makes life
%   a bit clearer. The \texttt{nn} and the fully-expanding (\texttt{x})
%   versions are created directly as this is most efficient.
%   Both conditionals compare the number left by \cs{__str_if_eq_x:nn}
%   with zero. The literal (\texttt{nn}) form wraps each argument in
%   \cs{exp_not:n} before handing it to the expanding comparison, whereas
%   the \texttt{x} form passes its arguments on unchanged.
%    \begin{macrocode}
\prg_new_conditional:Npnn \str_if_eq:nn #1#2 { p , T , F , TF }
  {
    \if_int_compare:w
      \__str_if_eq_x:nn { \exp_not:n {#1} } { \exp_not:n {#2} }
      = \c_zero
      \prg_return_true:
    \else:
      \prg_return_false:
    \fi:
  }
\prg_new_conditional:Npnn \str_if_eq_x:nn #1#2 { p , T , F , TF }
  {
    \if_int_compare:w \__str_if_eq_x:nn {#1} {#2} = \c_zero
      \prg_return_true:
    \else:
      \prg_return_false:
    \fi:
  }
%    \end{macrocode}
%   The variants of the literal comparison are generated from each of
%   the four base forms.
%    \begin{macrocode}
\cs_generate_variant:Nn \str_if_eq_p:nn { V , o , nV , no , VV }
\cs_generate_variant:Nn \str_if_eq:nnT  { V , o , nV , no , VV }
\cs_generate_variant:Nn \str_if_eq:nnF  { V , o , nV , no , VV }
\cs_generate_variant:Nn \str_if_eq:nnTF { V , o , nV , no , VV }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[int, EXP]{\@@_if_eq_x_return:nn}
% \end{macro}
%
% \begin{macro}[EXP]
%   {\str_case:nn, \str_case:on, \str_case:nV, \str_case:nv, \str_case_x:nn}
% \begin{macro}[EXP, TF]
%   {\str_case:nn, \str_case:on, \str_case:nV, \str_case:nv, \str_case_x:nn}
% \begin{macro}[EXP, aux]{\@@_case:nnTF, \@@_case_x:nnTF}
% \begin{macro}[aux, EXP]
%   {\@@_case:nw, \@@_case_x:nw, \@@_case_end:nw}
%   Much the same as \cs{tl_case:nn(TF)} here: just a change in the internal
%   comparison.
%   Every user-level function starts a \cs{tex_romannumeral:D} expansion
%   and hands over to a common auxiliary, supplying an empty
%   \meta{true code} and/or \meta{false code} where its signature omits
%   them. The auxiliary appends the \meta{test string} itself, paired with
%   empty code, after the user's cases, so the loop is certain to find a
%   match; the \meta{true code} and \meta{false code} follow, each
%   introduced by \cs{q_mark}, and the whole is closed by \cs{q_stop}.
%
%   The end-of-loop function is a copy of \cs{__prg_case_end:nw}, which is
%   defined outside this file: presumably it picks the appropriate branch,
%   discards the remaining material up to \cs{q_stop} and terminates the
%   \cs{tex_romannumeral:D} expansion.
%    \begin{macrocode}
\cs_new_eq:NN \@@_case_end:nw \__prg_case_end:nw
%    \end{macrocode}
%   The literal comparison: the loop tests each case in turn using
%   \cs{str_if_eq:nnTF}.
%    \begin{macrocode}
\cs_new:Npn \@@_case:nnTF #1#2#3#4
  { \@@_case:nw {#1} #2 {#1} { } \q_mark {#3} \q_mark {#4} \q_stop }
\cs_new:Npn \@@_case:nw #1#2#3
  {
    \str_if_eq:nnTF {#1} {#2}
      { \@@_case_end:nw {#3} }
      { \@@_case:nw {#1} }
  }
\cs_new:Npn \str_case:nn #1#2
  {
    \tex_romannumeral:D
    \@@_case:nnTF {#1} {#2} { } { }
  }
\cs_new:Npn \str_case:nnT #1#2#3
  {
    \tex_romannumeral:D
    \@@_case:nnTF {#1} {#2} {#3} { }
  }
\cs_new:Npn \str_case:nnF #1#2
  {
    \tex_romannumeral:D
    \@@_case:nnTF {#1} {#2} { }
  }
\cs_new:Npn \str_case:nnTF #1#2
  {
    \tex_romannumeral:D
    \@@_case:nnTF {#1} {#2}
  }
\cs_generate_variant:Nn \str_case:nn   { o , nV , nv }
\cs_generate_variant:Nn \str_case:nnT  { o , nV , nv }
\cs_generate_variant:Nn \str_case:nnF  { o , nV , nv }
\cs_generate_variant:Nn \str_case:nnTF { o , nV , nv }
%    \end{macrocode}
%   The expanding comparison has the same structure, with the loop using
%   \cs{str_if_eq_x:nnTF} instead.
%    \begin{macrocode}
\cs_new:Npn \@@_case_x:nnTF #1#2#3#4
  { \@@_case_x:nw {#1} #2 {#1} { } \q_mark {#3} \q_mark {#4} \q_stop }
\cs_new:Npn \@@_case_x:nw #1#2#3
  {
    \str_if_eq_x:nnTF {#1} {#2}
      { \@@_case_end:nw {#3} }
      { \@@_case_x:nw {#1} }
  }
\cs_new:Npn \str_case_x:nn #1#2
  {
    \tex_romannumeral:D
    \@@_case_x:nnTF {#1} {#2} { } { }
  }
\cs_new:Npn \str_case_x:nnT #1#2#3
  {
    \tex_romannumeral:D
    \@@_case_x:nnTF {#1} {#2} {#3} { }
  }
\cs_new:Npn \str_case_x:nnF #1#2
  {
    \tex_romannumeral:D
    \@@_case_x:nnTF {#1} {#2} { }
  }
\cs_new:Npn \str_case_x:nnTF #1#2
  {
    \tex_romannumeral:D
    \@@_case_x:nnTF {#1} {#2}
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsection{String manipulation}
%
% \begin{macro}[EXP]
%   {
%     \str_fold_case:n,
%     \str_lower_case:n, \str_lower_case:f,
%     \str_upper_case:n, \str_upper_case:f
%   }
% \begin{macro}[aux, EXP]{\__str_change_case:nn}
% \begin{macro}[aux, EXP]{\__str_change_case_aux:nn}
% \begin{macro}[aux, EXP]{\__str_change_case_loop:nw}
% \begin{macro}[aux, EXP]{\__str_change_case_space:n}
% \begin{macro}[aux, EXP]{\__str_change_case_char:nN}
% \begin{macro}[aux, EXP]{\__str_change_case_char:NNNNNNNNn}
%   Case changing for
%   programmatic reasons is done by first detokenizing
%   input then doing a simple loop that only has to worry about spaces
%   and everything else. The output is detokenized to allow data sharing
%   with text-based case changing. The key idea here of splitting up the
%   data files based on the character code of the current character is
%   shared with the text case changer too.
%
%   The three user functions differ only in the keyword (\texttt{fold},
%   \texttt{lower} or \texttt{upper}) passed as second argument to
%   \cs{__str_change_case:nn}. That function takes only the first
%   argument itself: it expands \cs{tl_to_str:n} on it and leaves the
%   keyword to be picked up by \cs{__str_change_case_aux:nn}, which
%   places the keyword first, followed by the string and the recursion
%   quarks. The loop function tests whether the remaining string starts
%   with a space and dispatches accordingly. The space function is
%   defined inside \cs{use:x} so that its parameter text is delimited by an
%   explicit space token: it outputs \cs{c_space_tl} and resumes the loop.
%   The character function stops at \cs{q_recursion_tail}; otherwise it
%   adds the character code to $1000000$ to obtain seven digits, which
%   are passed along with the character and the keyword to the final
%   auxiliary, and the f-expanded result is converted with
%   \cs{tl_to_str:n}. In that auxiliary |#1| to |#7| are the digits,
%   |#8| the character and |#9| the keyword: the character is looked up
%   with \cs{str_case:nvF} in the token list variable whose name is built
%   from the keyword and the last two digits (|#6| and |#7|), and is left
%   unchanged if no case matches.
%    \begin{macrocode}
\cs_new:Npn \str_fold_case:n #1 { \__str_change_case:nn {#1} { fold } }
\cs_new:Npn \str_lower_case:n #1 { \__str_change_case:nn {#1} { lower } }
\cs_new:Npn \str_upper_case:n #1 { \__str_change_case:nn {#1} { upper } }
\cs_generate_variant:Nn \str_lower_case:n { f }
\cs_generate_variant:Nn \str_upper_case:n { f }
\cs_new:Npn \__str_change_case:nn #1
  {
    \exp_after:wN \__str_change_case_aux:nn \exp_after:wN
      { \tl_to_str:n {#1} }
  }
\cs_new:Npn \__str_change_case_aux:nn #1#2
  {
    \__str_change_case_loop:nw {#2} #1 \q_recursion_tail \q_recursion_stop
  }
\cs_new:Npn \__str_change_case_loop:nw #1#2 \q_recursion_stop
  {
    \tl_if_head_is_space:nTF {#2}
      { \__str_change_case_space:n }
      { \__str_change_case_char:nN }
    {#1} #2 \q_recursion_stop
  }
\use:x
  { \cs_new:Npn \exp_not:N \__str_change_case_space:n ##1 \c_space_tl }
  {
    \c_space_tl
    \__str_change_case_loop:nw {#1}
  }
\cs_new:Npn \__str_change_case_char:nN #1#2
  {
    \quark_if_recursion_tail_stop:N #2
    \exp_args:Nf \tl_to_str:n
      {
        \exp_after:wN \__str_change_case_char:NNNNNNNNn
          \int_use:N \__int_eval:w 1000000 + `#2 \__int_eval_end: #2 {#1}
      }
    \__str_change_case_loop:nw {#1}
  }
\cs_new:Npn \__str_change_case_char:NNNNNNNNn #1#2#3#4#5#6#7#8#9
  {
    \str_case:nvF #8
      { c__unicode_ #9 _ #6 _X_ #7 _tl }
      { #8 }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsection{Deprecated functions}
%
% \begin{macro}{\str_case:nnn, \str_case:onn, \str_case_x:nnn}
%   Deprecated 2013-07-15.
%   The old three-argument names are kept as copies of the corresponding
%   \texttt{F} functions.
%    \begin{macrocode}
\cs_new_eq:NN \str_case:nnn   \str_case:nnF
\cs_new_eq:NN \str_case:onn   \str_case:onF
\cs_new_eq:NN \str_case_x:nnn \str_case_x:nnF
%    \end{macrocode}
% \end{macro}
%
%   Close the docstrip block opened at the start of the implementation.
%    \begin{macrocode}
%</initex|package>
%    \end{macrocode}
%
% \end{implementation}
%
% \PrintIndex