1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  *
  25  * Copyright 2018 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27 
  28 /*
  29  * Multibyte/wide-char conversion routines. SMB uses UTF-16 on the wire
  30  * (smb_wchar_t) and we use UTF-8 internally (our multi-byte, or mbs).
  31  */
  32 
  33 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
  34 #include <sys/types.h>
  35 #include <sys/sunddi.h>
  36 #else   /* _KERNEL || _FAKE_KERNEL */
  37 #include <stdio.h>
  38 #include <stdlib.h>
  39 #include <strings.h>
  40 #include <iconv.h>
  41 #include <assert.h>
  42 #endif  /* _KERNEL || _FAKE_KERNEL */
  43 #include <sys/u8_textprep.h>
  44 #include <smbsrv/string.h>
  45 
  46 
  47 /*
  48  * mbstowcs
  49  *
  50  * The mbstowcs() function converts a multibyte character string
  51  * mbstring into a wide character string wcstring. No more than
  52  * nwchars wide characters are stored. A terminating null wide
  53  * character is appended if there is room.
  54  *
  55  * Returns the number of wide characters converted, not counting
  56  * any terminating null wide character. Returns -1 if an invalid
  57  * multibyte character is encountered.
  58  */
  59 size_t
  60 smb_mbstowcs(smb_wchar_t *wcs, const char *mbs, size_t nwchars)
  61 {
  62         size_t mbslen, wcslen;
  63         int err;
  64 
  65         /* NULL or empty input is allowed. */
  66         if (mbs == NULL || *mbs == '\0') {
  67                 if (wcs != NULL && nwchars > 0)
  68                         *wcs = 0;
  69                 return (0);
  70         }
  71 
  72         /*
  73          * Traditional mbstowcs(3C) allows wcs==NULL to get the length.
  74          * SMB never calls it that way, but let's future-proof.
  75          */
  76         if (wcs == NULL) {
  77                 return ((size_t)-1);
  78         }
  79 
  80         mbslen = strlen(mbs);
  81         wcslen = nwchars;
  82         err = uconv_u8tou16((const uchar_t *)mbs, &mbslen,
  83             wcs, &wcslen, UCONV_OUT_LITTLE_ENDIAN);
  84         if (err != 0)
  85                 return ((size_t)-1);
  86 
  87         if (wcslen < nwchars)
  88                 wcs[wcslen] = 0;
  89 
  90         return (wcslen);
  91 }
  92 
  93 
  94 /*
  95  * mbtowc
  96  *
  97  * The mbtowc() function converts a multibyte character mbchar into
  98  * a wide character and stores the result in the object pointed to
  99  * by wcharp. Up to nbytes bytes are examined.
 100  *
 101  * If mbchar is NULL, mbtowc() returns zero to indicate that shift
 102  * states are not supported.  Shift states are used to switch between
 103  * representation modes using reserved bytes to signal shifting
 104  * without them being interpreted as characters.  If mbchar is null
 105  * mbtowc should return non-zero if the current locale requires shift
 106  * states.  Otherwise it should be return 0.
 107  *
 108  * If mbchar is non-null, returns the number of bytes processed in
 109  * mbchar.  If mbchar is null, convert the null (wcharp=0) but
 110  * return length zero.  If mbchar is invalid, returns -1.
 111  */
 112 int /*ARGSUSED*/
 113 smb_mbtowc(uint32_t *wcharp, const char *mbchar, size_t nbytes)
 114 {
 115         uint32_t wide_char;
 116         int count, err;
 117         size_t mblen;
 118         size_t wclen;
 119 
 120         if (mbchar == NULL)
 121                 return (0); /* no shift states */
 122 
 123         /*
 124          * How many bytes in this symbol?
 125          */
 126         count = u8_validate((char *)mbchar, nbytes, NULL, 0, &err);
 127         if (count < 0)
 128                 return (-1);
 129 
 130         mblen = count;
 131         wclen = 1;
 132         err = uconv_u8tou32((const uchar_t *)mbchar, &mblen,
 133             &wide_char, &wclen, UCONV_OUT_SYSTEM_ENDIAN);
 134         if (err != 0)
 135                 return (-1);
 136         if (wclen == 0) {
 137                 wide_char = 0;
 138                 count = 0;
 139         }
 140 
 141         if (wcharp)
 142                 *wcharp = wide_char;
 143 
 144         return (count);
 145 }
 146 
 147 
 148 /*
 149  * wctomb
 150  *
 151  * The wctomb() function converts a wide character wchar into a multibyte
 152  * character and stores the result in mbchar. The object pointed to by
 153  * mbchar must be large enough to accommodate the multibyte character.
 154  *
 155  * Returns the numberof bytes written to mbchar.
 156  * Note: handles null like any 1-byte char.
 157  */
 158 int
 159 smb_wctomb(char *mbchar, uint32_t wchar)
 160 {
 161         char junk[MTS_MB_CUR_MAX+1];
 162         size_t mblen;
 163         size_t wclen;
 164         int err;
 165 
 166         if (mbchar == NULL)
 167                 mbchar = junk;
 168 
 169         mblen = MTS_MB_CUR_MAX;
 170         wclen = 1;
 171         err = uconv_u32tou8(&wchar, &wclen, (uchar_t *)mbchar, &mblen,
 172             UCONV_IN_SYSTEM_ENDIAN | UCONV_IGNORE_NULL);
 173         if (err != 0)
 174                 return (-1);
 175 
 176         return ((int)mblen);
 177 }
 178 
 179 
 180 /*
 181  * wcstombs
 182  *
 183  * The wcstombs() function converts a wide character string wcstring
 184  * into a multibyte character string mbstring. Up to nbytes bytes are
 185  * stored in mbstring. Partial multibyte characters at the end of the
 186  * string are not stored. The multibyte character string is null
 187  * terminated if there is room.
 188  *
 189  * Returns the number of bytes converted, not counting the terminating
 190  * null byte. Returns -1 if an invalid WC sequence is encountered.
 191  */
 192 size_t
 193 smb_wcstombs(char *mbs, const smb_wchar_t *wcs, size_t nbytes)
 194 {
 195         size_t mbslen, wcslen;
 196         int err;
 197 
 198         /* NULL or empty input is allowed. */
 199         if (wcs == NULL || *wcs == 0) {
 200                 if (mbs != NULL && nbytes > 0)
 201                         *mbs = '\0';
 202                 return (0);
 203         }
 204 
 205         /*
 206          * Traditional wcstombs(3C) allows mbs==NULL to get the length.
 207          * SMB never calls it that way, but let's future-proof.
 208          */
 209         if (mbs == NULL) {
 210                 return ((size_t)-1);
 211         }
 212 
 213         /*
 214          * Compute wcslen
 215          */
 216         wcslen = 0;
 217         while (wcs[wcslen] != 0)
 218                 wcslen++;
 219 
 220         mbslen = nbytes;
 221         err = uconv_u16tou8(wcs, &wcslen,
 222             (uchar_t *)mbs, &mbslen, UCONV_IN_LITTLE_ENDIAN);
 223         if (err != 0)
 224                 return ((size_t)-1);
 225 
 226         if (mbslen < nbytes)
 227                 mbs[mbslen] = '\0';
 228 
 229         return (mbslen);
 230 }
 231 
 232 
 233 /*
 234  * Returns the number of bytes that would be written if the multi-
 235  * byte string mbs was converted to a wide character string, not
 236  * counting the terminating null wide character.
 237  */
 238 size_t
 239 smb_wcequiv_strlen(const char *mbs)
 240 {
 241         uint32_t        wide_char;
 242         size_t bytes;
 243         size_t len = 0;
 244 
 245         while (*mbs) {
 246                 bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
 247                 if (bytes == ((size_t)-1))
 248                         return ((size_t)-1);
 249                 mbs += bytes;
 250 
 251                 len += sizeof (smb_wchar_t);
 252                 if (bytes > 3) {
 253                         /*
 254                          * Extended unicode, so TWO smb_wchar_t
 255                          */
 256                         len += sizeof (smb_wchar_t);
 257                 }
 258         }
 259 
 260         return (len);
 261 }
 262 
 263 
 264 /*
 265  * Returns the number of bytes that would be written if the multi-
 266  * byte string mbs was converted to an OEM character string,
 267  * (smb_mbstooem) not counting the terminating null character.
 268  */
 269 size_t
 270 smb_sbequiv_strlen(const char *mbs)
 271 {
 272         size_t nbytes;
 273         size_t len = 0;
 274 
 275         while (*mbs) {
 276                 nbytes = smb_mbtowc(NULL, mbs, MTS_MB_CHAR_MAX);
 277                 if (nbytes == ((size_t)-1))
 278                         return ((size_t)-1);
 279                 if (nbytes == 0)
 280                         break;
 281 
 282                 if (nbytes == 1) {
 283                         /* ASCII */
 284                         len++;
 285                 } else if (nbytes < 8) {
 286                         /* Compute OEM length */
 287                         char mbsbuf[8];
 288                         uint8_t oembuf[8];
 289                         int oemlen;
 290                         (void) strlcpy(mbsbuf, mbs, nbytes+1);
 291                         oemlen = smb_mbstooem(oembuf, mbsbuf, 8);
 292                         if (oemlen < 0)
 293                                 return ((size_t)-1);
 294                         len += oemlen;
 295                 } else {
 296                         return ((size_t)-1);
 297                 }
 298 
 299                 mbs += nbytes;
 300         }
 301 
 302         return (len);
 303 }
 304 
/*
 * Convert OEM strings to/from internal (UTF-8) form.
 *
 * We rarely encounter these anymore because all modern
 * SMB clients use Unicode (UTF-16). The few cases where
 * this IS still called are normally using ASCII, i.e.
 * tag names etc. so we short-cut those cases.  If we get
 * something non-ASCII we have to call iconv.
 *
 * If we were to really support OEM code pages, we would
 * need to have a way to set the OEM code page from some
 * configuration value.  For now it's always CP850.
 * See also ./smb_oem.c
 *
 * smb_oem_codepage is the code page name that smb_oemtombs()
 * and smb_mbstooem() pass to iconv_open() (or kiconv_open()).
 */
static char smb_oem_codepage[32] = "CP850";
 320 
 321 /*
 322  * stombs
 323  *
 324  * Convert a null terminated OEM string 'string' to a UTF-8 string
 325  * no longer than max_mblen (null terminated if space).
 326  *
 327  * If the input string contains invalid OEM characters, a value
 328  * of -1 will be returned. Otherwise returns the length of 'mbs',
 329  * excluding the terminating null character.
 330  *
 331  * If either mbstring or string is a null pointer, -1 is returned.
 332  */
 333 int
 334 smb_oemtombs(char *mbs, const uint8_t *oems, int max_mblen)
 335 {
 336         uchar_t *p;
 337         int     oemlen;
 338         int     rlen;
 339         boolean_t need_iconv = B_FALSE;
 340 
 341         if (mbs == NULL || oems == NULL)
 342                 return (-1);
 343 
 344         /*
 345          * Check if the oems is all ASCII (and get the length
 346          * while we're at it) so we know if we need to iconv.
 347          * We usually can avoid the iconv calls.
 348          */
 349         oemlen = 0;
 350         p = (uchar_t *)oems;
 351         while (*p != '\0') {
 352                 oemlen++;
 353                 if (*p & 0x80)
 354                         need_iconv = B_TRUE;
 355                 p++;
 356         }
 357 
 358         if (need_iconv) {
 359                 int     rc;
 360                 char    *obuf = mbs;
 361                 size_t  olen = max_mblen;
 362                 size_t  ilen = oemlen;
 363 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
 364                 char *ibuf = (char *)oems;
 365                 kiconv_t ic;
 366                 int     err;
 367 
 368                 ic = kiconv_open("UTF-8", smb_oem_codepage);
 369                 if (ic == (kiconv_t)-1)
 370                         goto just_copy;
 371                 rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err);
 372                 (void) kiconv_close(ic);
 373 #else   /* _KERNEL || _FAKE_KERNEL */
 374                 const char *ibuf = (char *)oems;
 375                 iconv_t ic;
 376                 ic = iconv_open("UTF-8", smb_oem_codepage);
 377                 if (ic == (iconv_t)-1)
 378                         goto just_copy;
 379                 rc = iconv(ic, &ibuf, &ilen, &obuf, &olen);
 380                 (void) iconv_close(ic);
 381 #endif  /* _KERNEL || _FAKE_KERNEL */
 382                 if (rc < 0)
 383                         return (-1);
 384                 /* Return val. is output bytes. */
 385                 rlen = (max_mblen - olen);
 386         } else {
 387         just_copy:
 388                 rlen = oemlen;
 389                 if (rlen > max_mblen)
 390                         rlen = max_mblen;
 391                 bcopy(oems, mbs, rlen);
 392         }
 393         if (rlen < max_mblen)
 394                 mbs[rlen] = '\0';
 395 
 396         return (rlen);
 397 }
 398 
 399 
 400 /*
 401  * mbstos
 402  *
 403  * Convert a null terminated multi-byte string 'mbs' to an OEM string
 404  * no longer than max_oemlen (null terminated if space).
 405  *
 406  * If the input string contains invalid multi-byte characters, a value
 407  * of -1 will be returned. Otherwise returns the length of 'oems',
 408  * excluding the terminating null character.
 409  *
 410  * If either mbstring or string is a null pointer, -1 is returned.
 411  */
 412 int
 413 smb_mbstooem(uint8_t *oems, const char *mbs, int max_oemlen)
 414 {
 415         uchar_t *p;
 416         int     mbslen;
 417         int     rlen;
 418         boolean_t need_iconv = B_FALSE;
 419 
 420         if (oems == NULL || mbs == NULL)
 421                 return (-1);
 422 
 423         /*
 424          * Check if the mbs is all ASCII (and get the length
 425          * while we're at it) so we know if we need to iconv.
 426          * We usually can avoid the iconv calls.
 427          */
 428         mbslen = 0;
 429         p = (uchar_t *)mbs;
 430         while (*p != '\0') {
 431                 mbslen++;
 432                 if (*p & 0x80)
 433                         need_iconv = B_TRUE;
 434                 p++;
 435         }
 436 
 437         if (need_iconv) {
 438                 int     rc;
 439                 char    *obuf = (char *)oems;
 440                 size_t  olen = max_oemlen;
 441                 size_t  ilen = mbslen;
 442 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
 443                 char *ibuf = (char *)mbs;
 444                 kiconv_t ic;
 445                 int     err;
 446 
 447                 ic = kiconv_open(smb_oem_codepage, "UTF-8");
 448                 if (ic == (kiconv_t)-1)
 449                         goto just_copy;
 450                 rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err);
 451                 (void) kiconv_close(ic);
 452 #else   /* _KERNEL || _FAKE_KERNEL */
 453                 const char *ibuf = mbs;
 454                 iconv_t ic;
 455                 ic = iconv_open(smb_oem_codepage, "UTF-8");
 456                 if (ic == (iconv_t)-1)
 457                         goto just_copy;
 458                 rc = iconv(ic, &ibuf, &ilen, &obuf, &olen);
 459                 (void) iconv_close(ic);
 460 #endif  /* _KERNEL || _FAKE_KERNEL */
 461                 if (rc < 0)
 462                         return (-1);
 463                 /* Return val. is output bytes. */
 464                 rlen = (max_oemlen - olen);
 465         } else {
 466         just_copy:
 467                 rlen = mbslen;
 468                 if (rlen > max_oemlen)
 469                         rlen = max_oemlen;
 470                 bcopy(mbs, oems, rlen);
 471         }
 472         if (rlen < max_oemlen)
 473                 oems[rlen] = '\0';
 474 
 475         return (rlen);
 476 }