LCOV - code coverage report
Current view: top level - lib/util/charset - util_str.c (source / functions) Hit Total Coverage
Test: coverage report for master 98b443d9 Lines: 222 252 88.1 %
Date: 2024-05-31 13:13:24 Functions: 20 20 100.0 %

          Line data    Source code
       1             : /*
       2             :    Unix SMB/CIFS implementation.
       3             :    Samba utility functions
       4             :    Copyright (C) Andrew Tridgell 1992-2001
       5             :    Copyright (C) Simo Sorce 2001
       6             :    Copyright (C) Andrew Bartlett 2011
       7             :    Copyright (C) Jeremy Allison  1992-2007
       8             :    Copyright (C) Martin Pool     2003
       9             :    Copyright (C) James Peach     2006
      10             : 
      11             :    This program is free software; you can redistribute it and/or modify
      12             :    it under the terms of the GNU General Public License as published by
      13             :    the Free Software Foundation; either version 3 of the License, or
      14             :    (at your option) any later version.
      15             : 
      16             :    This program is distributed in the hope that it will be useful,
      17             :    but WITHOUT ANY WARRANTY; without even the implied warranty of
      18             :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      19             :    GNU General Public License for more details.
      20             : 
      21             :    You should have received a copy of the GNU General Public License
      22             :    along with this program.  If not, see <http://www.gnu.org/licenses/>.
      23             : */
      24             : 
      25             : #include "replace.h"
      26             : #include "system/locale.h"
      27             : #include "charset.h"
      28             : #include "lib/util/fault.h"
      29             : #include "lib/util/tsort.h"
      30             : 
      31             : #ifdef strcasecmp
      32             : #undef strcasecmp
      33             : #endif
      34             : #ifdef strncasecmp
      35             : #undef strncasecmp
      36             : #endif
      37             : 
      38             : 
      39             : /**
      40             :  Case insensitive string comparison, handle specified for testing
      41             : **/
      42   351549898 : _PUBLIC_ int strcasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
      43             :                                  const char *s1, const char *s2)
      44             : {
      45   351549898 :         codepoint_t c1=0, c2=0;
      46   351549898 :         codepoint_t u1=0, u2=0;
      47   351549898 :         codepoint_t l1=0, l2=0;
      48     2231315 :         size_t size1, size2;
      49             : 
      50             :         /* handle null ptr comparisons to simplify the use in qsort */
      51   351549898 :         if (s1 == s2) return 0;
      52   351549127 :         if (s1 == NULL) return -1;
      53   351549125 :         if (s2 == NULL) return 1;
      54             : 
      55  1174115955 :         while (*s1 && *s2) {
      56  1156218138 :                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
      57  1156218138 :                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
      58             : 
      59  1156218138 :                 if (c1 == INVALID_CODEPOINT ||
      60     3990302 :                     c2 == INVALID_CODEPOINT) {
      61           9 :                         return strcasecmp(s1, s2);
      62             :                 }
      63             : 
      64  1156218129 :                 s1 += size1;
      65  1156218129 :                 s2 += size2;
      66             : 
      67  1156218129 :                 if (c1 == c2) {
      68   820948021 :                         continue;
      69             :                 }
      70             : 
      71   335270108 :                 u1 = toupper_m(c1);
      72   335270108 :                 u2 = toupper_m(c2);
      73   335270108 :                 if (u1 == u2) {
      74     1618811 :                         continue;
      75             :                 }
      76             : 
      77   333651297 :                 l1 = tolower_m(c1);
      78   333651297 :                 l2 = tolower_m(c2);
      79   333651297 :                 if (l1 == l2) {
      80           0 :                         continue;
      81             :                 }
      82             : 
      83   333651297 :                 return NUMERIC_CMP(l1, l2);
      84             :         }
      85             : 
      86    17897817 :         return NUMERIC_CMP(*s1, *s2);
      87             : }
      88             : 
      89             : /**
      90             :  Case insensitive string comparison
      91             : **/
      92   351549880 : _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
      93             : {
      94   351549880 :         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
      95   351549880 :         return strcasecmp_m_handle(iconv_handle, s1, s2);
      96             : }
      97             : 
      98             : /**
      99             :  Case insensitive string comparison, length limited, handle specified for
     100             :  testing
     101             : **/
     102     7640199 : _PUBLIC_ int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
     103             :                                   const char *s1, const char *s2, size_t n)
     104             : {
     105     7640199 :         codepoint_t c1=0, c2=0;
     106     7640199 :         codepoint_t u1=0, u2=0;
     107     7640199 :         codepoint_t l1=0, l2=0;
     108        8532 :         size_t size1, size2;
     109             : 
     110             :         /* handle null ptr comparisons to simplify the use in qsort */
     111     7640199 :         if (s1 == s2) return 0;
     112     7639913 :         if (s1 == NULL) return -1;
     113     7639912 :         if (s2 == NULL) return 1;
     114             : 
     115    19255188 :         while (*s1 && *s2 && n) {
     116    18408934 :                 n--;
     117             : 
     118    18408934 :                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
     119    18408934 :                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
     120             : 
     121    18408934 :                 if (c1 == INVALID_CODEPOINT ||
     122       25208 :                     c2 == INVALID_CODEPOINT) {
     123             :                         /*
     124             :                          * n was specified in characters,
     125             :                          * now we must convert it to bytes.
     126             :                          * As bytes are the smallest
     127             :                          * character unit, the following
     128             :                          * increment and strncasecmp is always
     129             :                          * safe.
     130             :                          *
     131             :                          * The source string was already known
     132             :                          * to be n characters long, so we are
     133             :                          * guaranteed to be able to look at the
     134             :                          * (n remaining + size1) bytes from the
     135             :                          * s1 position).
     136             :                          */
     137           1 :                         n += size1;
     138           1 :                         return strncasecmp(s1, s2, n);
     139             :                 }
     140             : 
     141    18408933 :                 s1 += size1;
     142    18408933 :                 s2 += size2;
     143             : 
     144    18408933 :                 if (c1 == c2) {
     145    11594600 :                         continue;
     146             :                 }
     147             : 
     148     6814333 :                 u1 = toupper_m(c1);
     149     6814333 :                 u2 = toupper_m(c2);
     150     6814333 :                 if (u1 == u2) {
     151       20677 :                         continue;
     152             :                 }
     153             : 
     154     6793656 :                 l1 = tolower_m(c1);
     155     6793656 :                 l2 = tolower_m(c2);
     156     6793656 :                 if (l1 == l2) {
     157           0 :                         continue;
     158             :                 }
     159             : 
     160     6793656 :                 return NUMERIC_CMP(l1, l2);
     161             :         }
     162             : 
     163      846254 :         if (n == 0) {
     164      837060 :                 return 0;
     165             :         }
     166             : 
     167        6946 :         return NUMERIC_CMP(*s1, *s2);
     168             : }
     169             : 
     170             : /**
     171             :  Case insensitive string comparison, length limited
     172             : **/
     173     7640187 : _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
     174             : {
     175     7640187 :         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
     176     7640187 :         return strncasecmp_m_handle(iconv_handle, s1, s2, n);
     177             : }
     178             : 
     179             : /**
     180             :  * Compare 2 strings.
     181             :  *
     182             :  * @note The comparison is case-insensitive.
     183             :  **/
     184       99950 : _PUBLIC_ bool strequal_m(const char *s1, const char *s2)
     185             : {
     186       99950 :         return strcasecmp_m(s1,s2) == 0;
     187             : }
     188             : 
     189             : /**
     190             :  Compare 2 strings (case sensitive).
     191             : **/
     192     3333838 : _PUBLIC_ bool strcsequal(const char *s1,const char *s2)
     193             : {
     194     3333838 :         if (s1 == s2)
     195          40 :                 return true;
     196     3333790 :         if (!s1 || !s2)
     197           0 :                 return false;
     198             : 
     199     3333788 :         return strcmp(s1,s2) == 0;
     200             : }
     201             : 
     202             : /**
     203             :  * Calculate the number of units (8 or 16-bit, depending on the
     204             :  * destination charset) that would be needed to convert the input
     205             :  * string, which is expected to be in src_charset encoding, to the
     206             :  * destination charset (which should be a unicode charset).
     207             :  */
     208    40647491 : _PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic,
     209             :                                     const char *s, charset_t src_charset, charset_t dst_charset)
     210             : {
     211    40647491 :         size_t count = 0;
     212             : 
     213             : #ifdef DEVELOPER
     214    40647491 :         switch (dst_charset) {
     215           0 :         case CH_DOS:
     216             :         case CH_UNIX:
     217           0 :                 smb_panic("cannot call strlen_m_ext() with a variable dest charset (must be UTF16* or UTF8)");
     218    39783335 :         default:
     219    40647491 :                 break;
     220             :         }
     221             : 
     222    40647491 :         switch (src_charset) {
     223           0 :         case CH_UTF16LE:
     224             :         case CH_UTF16BE:
     225           0 :                 smb_panic("cannot call strlen_m_ext() with a UTF16 src charset (must be DOS, UNIX, DISPLAY or UTF8)");
     226    39783335 :         default:
     227    40647491 :                 break;
     228             :         }
     229             : #endif
     230    40647491 :         if (!s) {
     231       66134 :                 return 0;
     232             :         }
     233             : 
     234  1185643024 :         while (*s && !(((uint8_t)*s) & 0x80)) {
     235  1145065435 :                 s++;
     236  1145065435 :                 count++;
     237             :         }
     238             : 
     239    40577589 :         if (!*s) {
     240    39705340 :                 return count;
     241             :         }
     242             : 
     243      575596 :         while (*s) {
     244        3536 :                 size_t c_size;
     245      563707 :                 codepoint_t c = next_codepoint_handle_ext(ic, s, strnlen(s, 5),
     246             :                                                           src_charset, &c_size);
     247      563707 :                 s += c_size;
     248             : 
     249      563707 :                 switch (dst_charset) {
     250      555682 :                 case CH_UTF16LE:
     251             :                 case CH_UTF16BE:
     252             :                 case CH_UTF16MUNGED:
     253      555682 :                         if (c < 0x10000) {
     254             :                                 /* Unicode char fits into 16 bits. */
     255      492815 :                                 count += 1;
     256             :                         } else {
     257             :                                 /* Double-width unicode char - 32 bits. */
     258       62867 :                                 count += 2;
     259             :                         }
     260      553391 :                         break;
     261        8025 :                 case CH_UTF8:
     262             :                         /*
     263             :                          * this only checks ranges, and does not
     264             :                          * check for invalid codepoints
     265             :                          */
     266        8025 :                         if (c < 0x80) {
     267        6116 :                                 count += 1;
     268        1909 :                         } else if (c < 0x800) {
     269         871 :                                 count += 2;
     270        1038 :                         } else if (c < 0x10000) {
     271        1038 :                                 count += 3;
     272             :                         } else {
     273           0 :                                 count += 4;
     274             :                         }
     275        6780 :                         break;
     276           0 :                 default:
     277             :                         /*
     278             :                          * non-unicode encoding:
     279             :                          * assume that each codepoint fits into
     280             :                          * one unit in the destination encoding.
     281             :                          */
     282           0 :                         count += 1;
     283             :                 }
     284             :         }
     285             : 
     286       11861 :         return count;
     287             : }
     288             : 
     289             : /**
     290             :  * Calculate the number of units (8 or 16-bit, depending on the
     291             :  * destination charset) that would be needed to convert the input
     292             :  * string, which is expected to be in src_charset encoding, to the
     293             :  * destination charset (which should be a unicode charset).
     294             :  */
     295    40647479 : _PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
     296             : {
     297    40647479 :         struct smb_iconv_handle *ic = get_iconv_handle();
     298    40647479 :         return strlen_m_ext_handle(ic, s, src_charset, dst_charset);
     299             : }
     300             : 
     301    24878290 : _PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
     302             :                                   const charset_t dst_charset)
     303             : {
     304    24878290 :         if (!s) {
     305       94456 :                 return 0;
     306             :         }
     307    24783478 :         return strlen_m_ext(s, src_charset, dst_charset) + 1;
     308             : }
     309             : 
     310      918300 : _PUBLIC_ size_t strlen_m_ext_term_null(const char *s,
     311             :                                        const charset_t src_charset,
     312             :                                        const charset_t dst_charset)
     313             : {
     314        1952 :         size_t len;
     315      918300 :         if (!s) {
     316         972 :                 return 0;
     317             :         }
     318      917327 :         len = strlen_m_ext(s, src_charset, dst_charset);
     319      917327 :         if (len == 0) {
     320      608805 :                 return 0;
     321             :         }
     322             : 
     323      308293 :         return len+1;
     324             : }
     325             : 
     326             : /**
     327             :  * Calculate the number of 16-bit units that would be needed to convert
     328             :  * the input string, which is expected to be in CH_UNIX encoding, to UTF16.
     329             :  *
     330             :  * This will be the same as the number of bytes in a string for single
     331             :  * byte strings, but will be different for multibyte.
     332             :  */
     333    14946668 : _PUBLIC_ size_t strlen_m(const char *s)
     334             : {
     335    14946668 :         return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
     336             : }
     337             : 
     338             : /**
     339             :    Work out the number of multibyte chars in a string, including the NULL
     340             :    terminator.
     341             : **/
     342     2239129 : _PUBLIC_ size_t strlen_m_term(const char *s)
     343             : {
     344     2239129 :         return strlen_m_ext_term(s, CH_UNIX, CH_UTF16LE);
     345             : }
     346             : 
     347             : /*
     348             :  * Weird helper routine for the winreg pipe: If nothing is around, return 0,
     349             :  * if a string is there, include the terminator.
     350             :  */
     351             : 
     352      918300 : _PUBLIC_ size_t strlen_m_term_null(const char *s)
     353             : {
     354      918300 :         return strlen_m_ext_term_null(s, CH_UNIX, CH_UTF16LE);
     355             : }
     356             : 
     357             : /**
     358             :  Strchr and strrchr_m are a bit complex on general multi-byte strings.
     359             : **/
     360   324151610 : _PUBLIC_ char *strchr_m(const char *src, char c)
     361             : {
     362     2025234 :         const char *s;
     363   324151610 :         struct smb_iconv_handle *ic = get_iconv_handle();
     364   324151610 :         if (src == NULL) {
     365           0 :                 return NULL;
     366             :         }
     367             :         /* characters below 0x3F are guaranteed to not appear in
     368             :            non-initial position in multi-byte charsets */
     369   324151610 :         if ((c & 0xC0) == 0) {
     370    95654283 :                 return strchr(src, c);
     371             :         }
     372             : 
     373             :         /* this is quite a common operation, so we want it to be
     374             :            fast. We optimise for the ascii case, knowing that all our
     375             :            supported multi-byte character sets are ascii-compatible
     376             :            (ie. they match for the first 128 chars) */
     377             : 
     378  1589012426 :         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
     379  1360580310 :                 if (*s == c)
     380       65211 :                         return discard_const_p(char, s);
     381             :         }
     382             : 
     383   228432116 :         if (!*s)
     384   227213390 :                 return NULL;
     385             : 
     386             : #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
     387             :         /* With compose characters we must restart from the beginning. JRA. */
     388             :         s = src;
     389             : #endif
     390             : 
     391           4 :         while (*s) {
     392           3 :                 size_t size;
     393           3 :                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
     394           3 :                 if (c2 == c) {
     395           0 :                         return discard_const_p(char, s);
     396             :                 }
     397           3 :                 s += size;
     398             :         }
     399             : 
     400           0 :         return NULL;
     401             : }
     402             : 
     403             : /**
     404             :  * Multibyte-character version of strrchr
     405             :  */
     406     7866674 : _PUBLIC_ char *strrchr_m(const char *s, char c)
     407             : {
     408       38997 :         struct smb_iconv_handle *ic;
     409     7866674 :         char *ret = NULL;
     410             : 
     411     7866674 :         if (s == NULL) {
     412           0 :                 return NULL;
     413             :         }
     414             : 
     415             :         /* characters below 0x3F are guaranteed to not appear in
     416             :            non-initial position in multi-byte charsets */
     417     7866674 :         if ((c & 0xC0) == 0) {
     418     7814253 :                 return strrchr(s, c);
     419             :         }
     420             : 
     421             :         /* this is quite a common operation, so we want it to be
     422             :            fast. We optimise for the ascii case, knowing that all our
     423             :            supported multi-byte character sets are ascii-compatible
     424             :            (ie. they match for the first 128 chars). Also, in Samba
     425             :            we only search for ascii characters in 'c' and that
     426             :            in all mb character sets with a compound character
     427             :            containing c, if 'c' is not a match at position
     428             :            p, then p[-1] > 0x7f. JRA. */
     429             : 
     430             :         {
     431       52421 :                 size_t len = strlen(s);
     432       52421 :                 const char *cp = s;
     433       52421 :                 bool got_mb = false;
     434             : 
     435       52421 :                 if (len == 0)
     436         106 :                         return NULL;
     437       52315 :                 cp += (len - 1);
     438        1694 :                 do {
     439      342033 :                         if (c == *cp) {
     440             :                                 /* Could be a match. Part of a multibyte ? */
     441       34216 :                                 if ((cp > s) &&
     442       32340 :                                         (((unsigned char)cp[-1]) & 0x80)) {
     443             :                                         /* Yep - go slow :-( */
     444           0 :                                         got_mb = true;
     445           0 :                                         break;
     446             :                                 }
     447             :                                 /* No - we have a match ! */
     448       34035 :                                 return discard_const_p(char , cp);
     449             :                         }
     450      307817 :                 } while (cp-- != s);
     451       18064 :                 if (!got_mb)
     452       18064 :                         return NULL;
     453             :         }
     454             : 
     455           0 :         ic = get_iconv_handle();
     456             : 
     457           0 :         while (*s) {
     458           0 :                 size_t size;
     459           0 :                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
     460           0 :                 if (c2 == c) {
     461           0 :                         ret = discard_const_p(char, s);
     462             :                 }
     463           0 :                 s += size;
     464             :         }
     465             : 
     466           0 :         return ret;
     467             : }
     468             : 
     469             : /**
     470             :   return True if any (multi-byte) character is lower case
     471             : */
     472          35 : _PUBLIC_ bool strhaslower_handle(struct smb_iconv_handle *ic,
     473             :                                  const char *string)
     474             : {
     475         963 :         while (*string) {
     476         950 :                 size_t c_size;
     477         950 :                 codepoint_t s;
     478         950 :                 codepoint_t t;
     479             : 
     480         950 :                 s = next_codepoint_handle(ic, string, &c_size);
     481         950 :                 string += c_size;
     482             : 
     483         950 :                 t = toupper_m(s);
     484             : 
     485         950 :                 if (s != t) {
     486          22 :                         return true; /* that means it has lower case chars */
     487             :                 }
     488             :         }
     489             : 
     490           0 :         return false;
     491             : }
     492             : 
     493          17 : _PUBLIC_ bool strhaslower(const char *string)
     494             : {
     495          17 :         struct smb_iconv_handle *ic = get_iconv_handle();
     496          17 :         return strhaslower_handle(ic, string);
     497             : }
     498             : 
     499             : /**
     500             :   return True if any (multi-byte) character is upper case
     501             : */
     502          35 : _PUBLIC_ bool strhasupper_handle(struct smb_iconv_handle *ic,
     503             :                                  const char *string)
     504             : {
     505         954 :         while (*string) {
     506         941 :                 size_t c_size;
     507         941 :                 codepoint_t s;
     508         941 :                 codepoint_t t;
     509             : 
     510         941 :                 s = next_codepoint_handle(ic, string, &c_size);
     511         941 :                 string += c_size;
     512             : 
     513         941 :                 t = tolower_m(s);
     514             : 
     515         941 :                 if (s != t) {
     516          22 :                         return true; /* that means it has upper case chars */
     517             :                 }
     518             :         }
     519             : 
     520           0 :         return false;
     521             : }
     522             : 
     523          17 : _PUBLIC_ bool strhasupper(const char *string)
     524             : {
     525          17 :         struct smb_iconv_handle *ic = get_iconv_handle();
     526          17 :         return strhasupper_handle(ic, string);
     527             : }
     528             : 
     529             : /***********************************************************************
     530             :  strstr_m - We convert via ucs2 for now.
     531             : ***********************************************************************/
     532             : 
     533     2374860 : char *strstr_m(const char *src, const char *findstr)
     534             : {
     535     2374860 :         TALLOC_CTX *mem_ctx = NULL;
     536        9961 :         smb_ucs2_t *p;
     537        9961 :         smb_ucs2_t *src_w, *find_w;
     538        9961 :         const char *s;
     539        9961 :         char *s2;
     540     2374860 :         char *retp = NULL;
     541     2374860 :         size_t converted_size, findstr_len = 0;
     542             : 
     543             :         /* for correctness */
     544     2374860 :         if (!findstr[0]) {
     545           0 :                 return discard_const_p(char, src);
     546             :         }
     547             : 
     548             :         /* Samba does single character findstr calls a *lot*. */
     549     2374858 :         if (findstr[1] == '\0')
     550      109479 :                 return strchr_m(src, *findstr);
     551             : 
     552             :         /* We optimise for the ascii case, knowing that all our
     553             :            supported multi-byte character sets are ascii-compatible
     554             :            (ie. they match for the first 128 chars) */
     555             : 
     556    46564204 :         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
     557    45085253 :                 if (*s == *findstr) {
     558     2361051 :                         if (!findstr_len)
     559     1381397 :                                 findstr_len = strlen(findstr);
     560             : 
     561     2361051 :                         if (strncmp(s, findstr, findstr_len) == 0) {
     562      786428 :                                 return discard_const_p(char, s);
     563             :                         }
     564             :                 }
     565             :         }
     566             : 
     567     1478951 :         if (!*s)
     568     1474753 :                 return NULL;
     569             : 
     570             : #if 1 /* def BROKEN_UNICODE_COMPOSE_CHARACTERS */
     571             :         /* 'make check' fails unless we do this */
     572             : 
     573             :         /* With compose characters we must restart from the beginning. JRA. */
     574           9 :         s = src;
     575             : #endif
     576             : 
     577             :         /*
     578             :          * Use get_iconv_handle() just as a non-NULL talloc ctx. In
     579             :          * case we leak memory, this should then be more obvious in
     580             :          * the talloc report.
     581             :          */
     582           9 :         mem_ctx = talloc_new(get_iconv_handle());
     583           9 :         if (mem_ctx == NULL) {
     584           0 :                 return NULL;
     585             :         }
     586             : 
     587           9 :         if (!push_ucs2_talloc(mem_ctx, &src_w, src, &converted_size)) {
     588           0 :                 goto done;
     589             :         }
     590             : 
     591           9 :         if (!push_ucs2_talloc(mem_ctx, &find_w, findstr, &converted_size)) {
     592           3 :                 goto done;
     593             :         }
     594             : 
     595           6 :         p = strstr_w(src_w, find_w);
     596             : 
     597           6 :         if (!p) {
     598           3 :                 goto done;
     599             :         }
     600             : 
     601           3 :         *p = 0;
     602           3 :         if (!pull_ucs2_talloc(mem_ctx, &s2, src_w, &converted_size)) {
     603           0 :                 goto done;
     604             :         }
     605           3 :         retp = discard_const_p(char, (s+strlen(s2)));
     606           9 : done:
     607           9 :         TALLOC_FREE(mem_ctx);
     608           9 :         return retp;
     609             : }

Generated by: LCOV version 1.14