1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  *
  25  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27 
  28 /*
  29  * Multibyte/wide-char conversion routines. Wide-char encoding provides
  30  * a fixed size character encoding that maps to the Unicode 16-bit
  31  * (UCS-2) character set standard. Multibyte or UCS transformation
  32  * format (UTF) encoding is a variable length character encoding scheme
 * that is compatible with existing ASCII characters and guarantees that
  34  * the resultant strings do not contain embedded null characters. Both
  35  * types of encoding provide a null terminator: single byte for UTF-8
  36  * and a wide-char null for Unicode. See RFC 2044.
  37  *
  38  * The table below illustrates the UTF-8 encoding scheme. The letter x
  39  * indicates bits available for encoding the character value.
  40  *
  41  *      UCS-2                   UTF-8 octet sequence (binary)
  42  *      0x0000-0x007F   0xxxxxxx
  43  *      0x0080-0x07FF   110xxxxx 10xxxxxx
  44  *      0x0800-0xFFFF   1110xxxx 10xxxxxx 10xxxxxx
  45  *
  46  * RFC 2044
 * UTF-8, a transformation format of Unicode and ISO 10646
  48  * F. Yergeau
  49  * Alis Technologies
  50  * October 1996
  51  */
  52 
  53 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
  54 #include <sys/types.h>
  55 #include <sys/sunddi.h>
  56 #else
  57 #include <stdio.h>
  58 #include <stdlib.h>
  59 #include <assert.h>
  60 #include <strings.h>
  61 #endif
  62 #include <smbsrv/string.h>
  63 
  64 
  65 /*
  66  * mbstowcs
  67  *
  68  * The mbstowcs() function converts a multibyte character string
  69  * mbstring into a wide character string wcstring. No more than
  70  * nwchars wide characters are stored. A terminating null wide
  71  * character is appended if there is room.
  72  *
  73  * Returns the number of wide characters converted, not counting
  74  * any terminating null wide character. Returns -1 if an invalid
  75  * multibyte character is encountered.
  76  */
  77 size_t
  78 smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars)
  79 {
  80         int len;
  81         smb_wchar_t     *start = wcstring;
  82 
  83         while (nwchars--) {
  84                 len = smb_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX);
  85                 if (len < 0) {
  86                         *wcstring = 0;
  87                         return ((size_t)-1);
  88                 }
  89 
  90                 if (*mbstring == 0)
  91                         break;
  92 
  93                 ++wcstring;
  94                 mbstring += len;
  95         }
  96 
  97         return (wcstring - start);
  98 }
  99 
 100 
 101 /*
 102  * mbtowc
 103  *
 104  * The mbtowc() function converts a multibyte character mbchar into
 105  * a wide character and stores the result in the object pointed to
 106  * by wcharp. Up to nbytes bytes are examined.
 107  *
 108  * If mbchar is NULL, mbtowc() returns zero to indicate that shift
 109  * states are not supported.  Shift states are used to switch between
 110  * representation modes using reserved bytes to signal shifting
 111  * without them being interpreted as characters.  If mbchar is null
 112  * mbtowc should return non-zero if the current locale requires shift
 113  * states.  Otherwise it should be return 0.
 114  *
 115  * If mbchar is non-null, returns the number of bytes processed in
 116  * mbchar.  If mbchar is invalid, returns -1.
 117  */
 118 int /*ARGSUSED*/
 119 smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes)
 120 {
 121         unsigned char mbyte;
 122         smb_wchar_t wide_char;
 123         int count;
 124         int bytes_left;
 125 
 126         if (mbchar == NULL)
 127                 return (0); /* no shift states */
 128 
 129         /* 0xxxxxxx -> 1 byte ASCII encoding */
 130         if (((mbyte = *mbchar++) & 0x80) == 0) {
 131                 if (wcharp)
 132                         *wcharp = (smb_wchar_t)mbyte;
 133 
 134                 return (mbyte ? 1 : 0);
 135         }
 136 
 137         /* 10xxxxxx -> invalid first byte */
 138         if ((mbyte & 0x40) == 0)
 139                 return (-1);
 140 
 141         wide_char = mbyte;
 142         if ((mbyte & 0x20) == 0) {
 143                 wide_char &= 0x1f;
 144                 bytes_left = 1;
 145         } else if ((mbyte & 0x10) == 0) {
 146                 wide_char &= 0x0f;
 147                 bytes_left = 2;
 148         } else {
 149                 return (-1);
 150         }
 151 
 152         count = 1;
 153         while (bytes_left--) {
 154                 if (((mbyte = *mbchar++) & 0xc0) != 0x80)
 155                         return (-1);
 156 
 157                 count++;
 158                 wide_char = (wide_char << 6) | (mbyte & 0x3f);
 159         }
 160 
 161         if (wcharp)
 162                 *wcharp = wide_char;
 163 
 164         return (count);
 165 }
 166 
 167 
 168 /*
 169  * wctomb
 170  *
 171  * The wctomb() function converts a wide character wchar into a multibyte
 172  * character and stores the result in mbchar. The object pointed to by
 173  * mbchar must be large enough to accommodate the multibyte character.
 174  *
 175  * Returns the numberof bytes written to mbchar.
 176  */
 177 int
 178 smb_wctomb(char *mbchar, smb_wchar_t wchar)
 179 {
 180         if ((wchar & ~0x7f) == 0) {
 181                 *mbchar = (char)wchar;
 182                 return (1);
 183         }
 184 
 185         if ((wchar & ~0x7ff) == 0) {
 186                 *mbchar++ = (wchar >> 6) | 0xc0;
 187                 *mbchar = (wchar & 0x3f) | 0x80;
 188                 return (2);
 189         }
 190 
 191         *mbchar++ = (wchar >> 12) | 0xe0;
 192         *mbchar++ = ((wchar >> 6) & 0x3f) | 0x80;
 193         *mbchar = (wchar & 0x3f) | 0x80;
 194         return (3);
 195 }
 196 
 197 
 198 /*
 199  * wcstombs
 200  *
 201  * The wcstombs() function converts a wide character string wcstring
 202  * into a multibyte character string mbstring. Up to nbytes bytes are
 203  * stored in mbstring. Partial multibyte characters at the end of the
 204  * string are not stored. The multibyte character string is null
 205  * terminated if there is room.
 206  *
 207  * Returns the number of bytes converted, not counting the terminating
 208  * null byte.
 209  */
 210 size_t
 211 smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes)
 212 {
 213         char *start = mbstring;
 214         const smb_wchar_t *wcp = wcstring;
 215         smb_wchar_t wide_char = 0;
 216         char buf[4];
 217         size_t len;
 218 
 219         if ((mbstring == NULL) || (wcstring == NULL))
 220                 return (0);
 221 
 222         while (nbytes > MTS_MB_CHAR_MAX) {
 223                 wide_char = *wcp++;
 224                 len = smb_wctomb(mbstring, wide_char);
 225 
 226                 if (wide_char == 0)
 227                         /*LINTED E_PTRDIFF_OVERFLOW*/
 228                         return (mbstring - start);
 229 
 230                 mbstring += len;
 231                 nbytes -= len;
 232         }
 233 
 234         while (wide_char && nbytes) {
 235                 wide_char = *wcp++;
 236                 if ((len = smb_wctomb(buf, wide_char)) > nbytes) {
 237                         *mbstring = 0;
 238                         break;
 239                 }
 240 
 241                 bcopy(buf, mbstring, len);
 242                 mbstring += len;
 243                 nbytes -= len;
 244         }
 245 
 246         /*LINTED E_PTRDIFF_OVERFLOW*/
 247         return (mbstring - start);
 248 }
 249 
 250 
 251 /*
 252  * Returns the number of bytes that would be written if the multi-
 253  * byte string mbs was converted to a wide character string, not
 254  * counting the terminating null wide character.
 255  */
 256 size_t
 257 smb_wcequiv_strlen(const char *mbs)
 258 {
 259         smb_wchar_t     wide_char;
 260         size_t bytes;
 261         size_t len = 0;
 262 
 263         while (*mbs) {
 264                 bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
 265                 if (bytes == ((size_t)-1))
 266                         return ((size_t)-1);
 267 
 268                 len += sizeof (smb_wchar_t);
 269                 mbs += bytes;
 270         }
 271 
 272         return (len);
 273 }
 274 
 275 
 276 /*
 277  * Returns the number of bytes that would be written if the multi-
 278  * byte string mbs was converted to a single byte character string,
 279  * not counting the terminating null character.
 280  */
 281 size_t
 282 smb_sbequiv_strlen(const char *mbs)
 283 {
 284         smb_wchar_t     wide_char;
 285         size_t nbytes;
 286         size_t len = 0;
 287 
 288         while (*mbs) {
 289                 nbytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
 290                 if (nbytes == ((size_t)-1))
 291                         return ((size_t)-1);
 292 
 293                 if (wide_char & 0xFF00)
 294                         len += sizeof (smb_wchar_t);
 295                 else
 296                         ++len;
 297 
 298                 mbs += nbytes;
 299         }
 300 
 301         return (len);
 302 }
 303 
 304 
 305 /*
 306  * stombs
 307  *
 308  * Convert a regular null terminated string 'string' to a UTF-8 encoded
 309  * null terminated multi-byte string 'mbstring'. Only full converted
 310  * UTF-8 characters will be written 'mbstring'. If a character will not
 311  * fit within the remaining buffer space or 'mbstring' will overflow
 312  * max_mblen, the conversion process will be terminated and 'mbstring'
 313  * will be null terminated.
 314  *
 315  * Returns the number of bytes written to 'mbstring', excluding the
 316  * terminating null character.
 317  *
 318  * If either mbstring or string is a null pointer, -1 is returned.
 319  */
 320 int
 321 smb_stombs(char *mbstring, char *string, int max_mblen)
 322 {
 323         char *start = mbstring;
 324         unsigned char *p = (unsigned char *)string;
 325         int space_left = max_mblen;
 326         int     len;
 327         smb_wchar_t     wide_char;
 328         char buf[4];
 329 
 330         if (!mbstring || !string)
 331                 return (-1);
 332 
 333         while (*p && space_left > 2) {
 334                 wide_char = *p++;
 335                 len = smb_wctomb(mbstring, wide_char);
 336                 mbstring += len;
 337                 space_left -= len;
 338         }
 339 
 340         if (*p) {
 341                 wide_char = *p;
 342                 if ((len = smb_wctomb(buf, wide_char)) < 2) {
 343                         *mbstring = *buf;
 344                         mbstring += len;
 345                         space_left -= len;
 346                 }
 347         }
 348 
 349         *mbstring = '\0';
 350 
 351         /*LINTED E_PTRDIFF_OVERFLOW*/
 352         return (mbstring - start);
 353 }
 354 
 355 
 356 /*
 357  * mbstos
 358  *
 359  * Convert a null terminated multi-byte string 'mbstring' to a regular
 360  * null terminated string 'string'.  A 1-byte character in 'mbstring'
 361  * maps to a 1-byte character in 'string'. A 2-byte character in
 362  * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null.
 363  * Otherwise the upper byte null will be discarded to ensure that the
 364  * output stream does not contain embedded null characters.
 365  *
 366  * If the input stream contains invalid multi-byte characters, a value
 367  * of -1 will be returned. Otherwise the length of 'string', excluding
 368  * the terminating null character, is returned.
 369  *
 370  * If either mbstring or string is a null pointer, -1 is returned.
 371  */
 372 int
 373 smb_mbstos(char *string, const char *mbstring)
 374 {
 375         smb_wchar_t wc;
 376         unsigned char *start = (unsigned char *)string;
 377         int len;
 378 
 379         if (string == NULL || mbstring == NULL)
 380                 return (-1);
 381 
 382         while (*mbstring) {
 383                 if ((len = smb_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
 384                         *string = 0;
 385                         return (-1);
 386                 }
 387 
 388                 if (wc & 0xFF00) {
 389                         /*LINTED E_BAD_PTR_CAST_ALIGN*/
 390                         *((smb_wchar_t *)string) = wc;
 391                         string += sizeof (smb_wchar_t);
 392                 }
 393                 else
 394                 {
 395                         *string = (unsigned char)wc;
 396                         string++;
 397                 }
 398 
 399                 mbstring += len;
 400         }
 401 
 402         *string = 0;
 403 
 404         /*LINTED E_PTRDIFF_OVERFLOW*/
 405         return ((unsigned char *)string - start);
 406 }