Print this page
NEX-19712 SMB directory listings sometimes wrong after NEX-19025
Reviewed by: Matt Barden <matt.barden@nexenta.com>
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
NEX-19025 CIFS gets confused with filenames containing enhanced Unicode
Reviewed by: Matt Barden <matt.barden@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
and: (fix build, check-rtime)
NEX-4458 Incorrect directory listing response for non-UNICODE clients
Reviewed by: Matt Barden <Matt.Barden@nexenta.com>
Reviewed by: Kevin Crowe <kevin.crowe@nexenta.com>
NEX-2460 libfksmbd should not link with libsmb
SMB-50 User-mode SMB server
 Includes work by these authors:
 Thomas Keiser <thomas.keiser@nexenta.com>
 Albert Lee <trisk@nexenta.com>
        
@@ -20,47 +20,29 @@
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2018 Nexenta Systems, Inc.  All rights reserved.
  */
 
 /*
- * Multibyte/wide-char conversion routines. Wide-char encoding provides
- * a fixed size character encoding that maps to the Unicode 16-bit
- * (UCS-2) character set standard. Multibyte or UCS transformation
- * format (UTF) encoding is a variable length character encoding scheme
- * that s compatible with existing ASCII characters and guarantees that
- * the resultant strings do not contain embedded null characters. Both
- * types of encoding provide a null terminator: single byte for UTF-8
- * and a wide-char null for Unicode. See RFC 2044.
- *
- * The table below illustrates the UTF-8 encoding scheme. The letter x
- * indicates bits available for encoding the character value.
- *
- *      UCS-2                   UTF-8 octet sequence (binary)
- *      0x0000-0x007F   0xxxxxxx
- *      0x0080-0x07FF   110xxxxx 10xxxxxx
- *      0x0800-0xFFFF   1110xxxx 10xxxxxx 10xxxxxx
- *
- * RFC 2044
- * UTF-8,a transformation format of UNICODE and ISO 10646
- * F. Yergeau
- * Alis Technologies
- * October 1996
+ * Multibyte/wide-char conversion routines. SMB uses UTF-16 on the wire
+ * (smb_wchar_t) and we use UTF-8 internally (our multi-byte, or mbs).
  */
 
 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
 #include <sys/types.h>
 #include <sys/sunddi.h>
-#else
+#else   /* _KERNEL || _FAKE_KERNEL */
 #include <stdio.h>
 #include <stdlib.h>
-#include <assert.h>
 #include <strings.h>
-#endif
+#include <iconv.h>
+#include <assert.h>
+#endif  /* _KERNEL || _FAKE_KERNEL */
+#include <sys/u8_textprep.h>
 #include <smbsrv/string.h>
 
 
 /*
  * mbstowcs
@@ -73,30 +55,41 @@
  * Returns the number of wide characters converted, not counting
  * any terminating null wide character. Returns -1 if an invalid
  * multibyte character is encountered.
  */
 size_t
-smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars)
+smb_mbstowcs(smb_wchar_t *wcs, const char *mbs, size_t nwchars)
 {
-        int len;
-        smb_wchar_t     *start = wcstring;
+        size_t mbslen, wcslen;
+        int err;
 
-        while (nwchars--) {
-                len = smb_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX);
-                if (len < 0) {
-                        *wcstring = 0;
+        /* NULL or empty input is allowed. */
+        if (mbs == NULL || *mbs == '\0') {
+                if (wcs != NULL && nwchars > 0)
+                        *wcs = 0;
+                return (0);
+        }
+
+        /*
+         * Traditional mbstowcs(3C) allows wcs==NULL to get the length.
+         * SMB never calls it that way, so reject NULL with an error.
+         */
+        if (wcs == NULL) {
                         return ((size_t)-1);
                 }
 
-                if (*mbstring == 0)
-                        break;
+        mbslen = strlen(mbs);
+        wcslen = nwchars;
+        err = uconv_u8tou16((const uchar_t *)mbs, &mbslen,
+            wcs, &wcslen, UCONV_OUT_LITTLE_ENDIAN);
+        if (err != 0)
+                return ((size_t)-1);
 
-                ++wcstring;
-                mbstring += len;
-        }
+        if (wcslen < nwchars)
+                wcs[wcslen] = 0;
 
-        return (wcstring - start);
+        return (wcslen);
 }
 
 
 /*
  * mbtowc
@@ -111,55 +104,42 @@
  * without them being interpreted as characters.  If mbchar is null
  * mbtowc should return non-zero if the current locale requires shift
  * states.  Otherwise it should be return 0.
  *
  * If mbchar is non-null, returns the number of bytes processed in
- * mbchar.  If mbchar is invalid, returns -1.
+ * mbchar.  If mbchar points at a null byte, store a null wide char
+ * in *wcharp but return length zero.  If mbchar is invalid, returns -1.
  */
 int /*ARGSUSED*/
-smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes)
+smb_mbtowc(uint32_t *wcharp, const char *mbchar, size_t nbytes)
 {
-        unsigned char mbyte;
-        smb_wchar_t wide_char;
-        int count;
-        int bytes_left;
+        uint32_t wide_char;
+        int count, err;
+        size_t mblen;
+        size_t wclen;
 
         if (mbchar == NULL)
                 return (0); /* no shift states */
 
-        /* 0xxxxxxx -> 1 byte ASCII encoding */
-        if (((mbyte = *mbchar++) & 0x80) == 0) {
-                if (wcharp)
-                        *wcharp = (smb_wchar_t)mbyte;
-
-                return (mbyte ? 1 : 0);
-        }
-
-        /* 10xxxxxx -> invalid first byte */
-        if ((mbyte & 0x40) == 0)
+        /*
+         * How many bytes in this symbol?
+         */
+        count = u8_validate((char *)mbchar, nbytes, NULL, 0, &err);
+        if (count < 0)
                 return (-1);
 
-        wide_char = mbyte;
-        if ((mbyte & 0x20) == 0) {
-                wide_char &= 0x1f;
-                bytes_left = 1;
-        } else if ((mbyte & 0x10) == 0) {
-                wide_char &= 0x0f;
-                bytes_left = 2;
-        } else {
+        mblen = count;
+        wclen = 1;
+        err = uconv_u8tou32((const uchar_t *)mbchar, &mblen,
+            &wide_char, &wclen, UCONV_OUT_SYSTEM_ENDIAN);
+        if (err != 0)
                 return (-1);
+        if (wclen == 0) {
+                wide_char = 0;
+                count = 0;
         }
 
-        count = 1;
-        while (bytes_left--) {
-                if (((mbyte = *mbchar++) & 0xc0) != 0x80)
-                        return (-1);
-
-                count++;
-                wide_char = (wide_char << 6) | (mbyte & 0x3f);
-        }
-
         if (wcharp)
                 *wcharp = wide_char;
 
         return (count);
 }
@@ -171,29 +151,31 @@
  * The wctomb() function converts a wide character wchar into a multibyte
  * character and stores the result in mbchar. The object pointed to by
  * mbchar must be large enough to accommodate the multibyte character.
  *
  * Returns the numberof bytes written to mbchar.
+ * Note: handles null like any 1-byte char.
  */
 int
-smb_wctomb(char *mbchar, smb_wchar_t wchar)
+smb_wctomb(char *mbchar, uint32_t wchar)
 {
-        if ((wchar & ~0x7f) == 0) {
-                *mbchar = (char)wchar;
-                return (1);
-        }
+        char junk[MTS_MB_CUR_MAX+1];
+        size_t mblen;
+        size_t wclen;
+        int err;
 
-        if ((wchar & ~0x7ff) == 0) {
-                *mbchar++ = (wchar >> 6) | 0xc0;
-                *mbchar = (wchar & 0x3f) | 0x80;
-                return (2);
-        }
+        if (mbchar == NULL)
+                mbchar = junk;
 
-        *mbchar++ = (wchar >> 12) | 0xe0;
-        *mbchar++ = ((wchar >> 6) & 0x3f) | 0x80;
-        *mbchar = (wchar & 0x3f) | 0x80;
-        return (3);
+        mblen = MTS_MB_CUR_MAX;
+        wclen = 1;
+        err = uconv_u32tou8(&wchar, &wclen, (uchar_t *)mbchar, &mblen,
+            UCONV_IN_SYSTEM_ENDIAN | UCONV_IGNORE_NULL);
+        if (err != 0)
+                return (-1);
+
+        return ((int)mblen);
 }
 
 
 /*
  * wcstombs
@@ -203,50 +185,50 @@
  * stored in mbstring. Partial multibyte characters at the end of the
  * string are not stored. The multibyte character string is null
  * terminated if there is room.
  *
  * Returns the number of bytes converted, not counting the terminating
- * null byte.
+ * null byte. Returns -1 if an invalid WC sequence is encountered.
  */
 size_t
-smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes)
+smb_wcstombs(char *mbs, const smb_wchar_t *wcs, size_t nbytes)
 {
-        char *start = mbstring;
-        const smb_wchar_t *wcp = wcstring;
-        smb_wchar_t wide_char = 0;
-        char buf[4];
-        size_t len;
+        size_t mbslen, wcslen;
+        int err;
 
-        if ((mbstring == NULL) || (wcstring == NULL))
+        /* NULL or empty input is allowed. */
+        if (wcs == NULL || *wcs == 0) {
+                if (mbs != NULL && nbytes > 0)
+                        *mbs = '\0';
                 return (0);
-
-        while (nbytes > MTS_MB_CHAR_MAX) {
-                wide_char = *wcp++;
-                len = smb_wctomb(mbstring, wide_char);
-
-                if (wide_char == 0)
-                        /*LINTED E_PTRDIFF_OVERFLOW*/
-                        return (mbstring - start);
-
-                mbstring += len;
-                nbytes -= len;
         }
 
-        while (wide_char && nbytes) {
-                wide_char = *wcp++;
-                if ((len = smb_wctomb(buf, wide_char)) > nbytes) {
-                        *mbstring = 0;
-                        break;
+        /*
+         * Traditional wcstombs(3C) allows mbs==NULL to get the length.
+         * SMB never calls it that way, so reject NULL with an error.
+         */
+        if (mbs == NULL) {
+                return ((size_t)-1);
                 }
 
-                bcopy(buf, mbstring, len);
-                mbstring += len;
-                nbytes -= len;
-        }
+        /*
+         * Compute wcslen
+         */
+        wcslen = 0;
+        while (wcs[wcslen] != 0)
+                wcslen++;
 
-        /*LINTED E_PTRDIFF_OVERFLOW*/
-        return (mbstring - start);
+        mbslen = nbytes;
+        err = uconv_u16tou8(wcs, &wcslen,
+            (uchar_t *)mbs, &mbslen, UCONV_IN_LITTLE_ENDIAN);
+        if (err != 0)
+                return ((size_t)-1);
+
+        if (mbslen < nbytes)
+                mbs[mbslen] = '\0';
+
+        return (mbslen);
 }
 
 
 /*
  * Returns the number of bytes that would be written if the multi-
@@ -254,153 +236,241 @@
  * counting the terminating null wide character.
  */
 size_t
 smb_wcequiv_strlen(const char *mbs)
 {
-        smb_wchar_t     wide_char;
+        uint32_t        wide_char;
         size_t bytes;
         size_t len = 0;
 
         while (*mbs) {
                 bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
                 if (bytes == ((size_t)-1))
                         return ((size_t)-1);
+                mbs += bytes;
 
                 len += sizeof (smb_wchar_t);
-                mbs += bytes;
+                if (bytes > 3) {
+                        /*
+                         * Extended unicode, so TWO smb_wchar_t
+                         */
+                        len += sizeof (smb_wchar_t);
         }
+        }
 
         return (len);
 }
 
 
 /*
  * Returns the number of bytes that would be written if the multi-
- * byte string mbs was converted to a single byte character string,
- * not counting the terminating null character.
+ * byte string mbs was converted to an OEM character string
+ * (see smb_mbstooem), not counting the terminating null character.
  */
 size_t
 smb_sbequiv_strlen(const char *mbs)
 {
-        smb_wchar_t     wide_char;
         size_t nbytes;
         size_t len = 0;
 
         while (*mbs) {
-                nbytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
+                nbytes = smb_mbtowc(NULL, mbs, MTS_MB_CHAR_MAX);
                 if (nbytes == ((size_t)-1))
                         return ((size_t)-1);
+                if (nbytes == 0)
+                        break;
 
-                if (wide_char & 0xFF00)
-                        len += sizeof (smb_wchar_t);
-                else
-                        ++len;
+                if (nbytes == 1) {
+                        /* ASCII */
+                        len++;
+                } else if (nbytes < 8) {
+                        /* Compute OEM length */
+                        char mbsbuf[8];
+                        uint8_t oembuf[8];
+                        int oemlen;
+                        (void) strlcpy(mbsbuf, mbs, nbytes+1);
+                        oemlen = smb_mbstooem(oembuf, mbsbuf, 8);
+                        if (oemlen < 0)
+                                return ((size_t)-1);
+                        len += oemlen;
+                } else {
+                        return ((size_t)-1);
+                }
 
                 mbs += nbytes;
         }
 
         return (len);
 }
 
+/*
+ * Convert OEM strings to/from internal (UTF-8) form.
+ *
+ * We rarely encounter these anymore because all modern
+ * SMB clients use Unicode (UTF-16). The few cases where
+ * this IS still called are normally using ASCII, i.e.
+ * tag names etc. so short-cut those cases.  If we get
+ * something non-ASCII we have to call iconv.
+ *
+ * If we were to really support OEM code pages, we would
+ * need to have a way to set the OEM code page from some
+ * configuration value.  For now it's always CP850.
+ * See also ./smb_oem.c
+ */
+static char smb_oem_codepage[32] = "CP850";
 
 /*
  * stombs
  *
- * Convert a regular null terminated string 'string' to a UTF-8 encoded
- * null terminated multi-byte string 'mbstring'. Only full converted
- * UTF-8 characters will be written 'mbstring'. If a character will not
- * fit within the remaining buffer space or 'mbstring' will overflow
- * max_mblen, the conversion process will be terminated and 'mbstring'
- * will be null terminated.
+ * Convert a null terminated OEM string 'oems' to a UTF-8 string 'mbs'
+ * no longer than max_mblen bytes (null terminated if there is room).
  *
- * Returns the number of bytes written to 'mbstring', excluding the
- * terminating null character.
+ * If the input string contains invalid OEM characters, a value
+ * of -1 will be returned. Otherwise returns the length of 'mbs',
+ * excluding the terminating null character.
  *
  * If either mbstring or string is a null pointer, -1 is returned.
  */
 int
-smb_stombs(char *mbstring, char *string, int max_mblen)
+smb_oemtombs(char *mbs, const uint8_t *oems, int max_mblen)
 {
-        char *start = mbstring;
-        unsigned char *p = (unsigned char *)string;
-        int space_left = max_mblen;
-        int     len;
-        smb_wchar_t     wide_char;
-        char buf[4];
+        uchar_t *p;
+        int     oemlen;
+        int     rlen;
+        boolean_t need_iconv = B_FALSE;
 
-        if (!mbstring || !string)
+        if (mbs == NULL || oems == NULL)
                 return (-1);
 
-        while (*p && space_left > 2) {
-                wide_char = *p++;
-                len = smb_wctomb(mbstring, wide_char);
-                mbstring += len;
-                space_left -= len;
+        /*
+         * Check if the oems is all ASCII (and get the length
+         * while we're at it) so we know if we need to iconv.
+         * We usually can avoid the iconv calls.
+         */
+        oemlen = 0;
+        p = (uchar_t *)oems;
+        while (*p != '\0') {
+                oemlen++;
+                if (*p & 0x80)
+                        need_iconv = B_TRUE;
+                p++;
         }
 
-        if (*p) {
-                wide_char = *p;
-                if ((len = smb_wctomb(buf, wide_char)) < 2) {
-                        *mbstring = *buf;
-                        mbstring += len;
-                        space_left -= len;
+        if (need_iconv) {
+                int     rc;
+                char    *obuf = mbs;
+                size_t  olen = max_mblen;
+                size_t  ilen = oemlen;
+#if defined(_KERNEL) || defined(_FAKE_KERNEL)
+                char *ibuf = (char *)oems;
+                kiconv_t ic;
+                int     err;
+
+                ic = kiconv_open("UTF-8", smb_oem_codepage);
+                if (ic == (kiconv_t)-1)
+                        goto just_copy;
+                rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err);
+                (void) kiconv_close(ic);
+#else   /* _KERNEL || _FAKE_KERNEL */
+                const char *ibuf = (char *)oems;
+                iconv_t ic;
+                ic = iconv_open("UTF-8", smb_oem_codepage);
+                if (ic == (iconv_t)-1)
+                        goto just_copy;
+                rc = iconv(ic, &ibuf, &ilen, &obuf, &olen);
+                (void) iconv_close(ic);
+#endif  /* _KERNEL || _FAKE_KERNEL */
+                if (rc < 0)
+                        return (-1);
+                /* Return val. is output bytes. */
+                rlen = (max_mblen - olen);
+        } else {
+        just_copy:
+                rlen = oemlen;
+                if (rlen > max_mblen)
+                        rlen = max_mblen;
+                bcopy(oems, mbs, rlen);
                 }
-        }
+        if (rlen < max_mblen)
+                mbs[rlen] = '\0';
 
-        *mbstring = '\0';
-
-        /*LINTED E_PTRDIFF_OVERFLOW*/
-        return (mbstring - start);
+        return (rlen);
 }
 
 
 /*
  * mbstos
  *
- * Convert a null terminated multi-byte string 'mbstring' to a regular
- * null terminated string 'string'.  A 1-byte character in 'mbstring'
- * maps to a 1-byte character in 'string'. A 2-byte character in
- * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null.
- * Otherwise the upper byte null will be discarded to ensure that the
- * output stream does not contain embedded null characters.
+ * Convert a null terminated multi-byte string 'mbs' to an OEM string
+ * 'oems' no longer than max_oemlen bytes (null terminated if room).
  *
- * If the input stream contains invalid multi-byte characters, a value
- * of -1 will be returned. Otherwise the length of 'string', excluding
- * the terminating null character, is returned.
+ * If the input string contains invalid multi-byte characters, a value
+ * of -1 will be returned. Otherwise returns the length of 'oems',
+ * excluding the terminating null character.
  *
  * If either mbstring or string is a null pointer, -1 is returned.
  */
 int
-smb_mbstos(char *string, const char *mbstring)
+smb_mbstooem(uint8_t *oems, const char *mbs, int max_oemlen)
 {
-        smb_wchar_t wc;
-        unsigned char *start = (unsigned char *)string;
-        int len;
+        uchar_t *p;
+        int     mbslen;
+        int     rlen;
+        boolean_t need_iconv = B_FALSE;
 
-        if (string == NULL || mbstring == NULL)
+        if (oems == NULL || mbs == NULL)
                 return (-1);
 
-        while (*mbstring) {
-                if ((len = smb_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
-                        *string = 0;
-                        return (-1);
+        /*
+         * Check if the mbs is all ASCII (and get the length
+         * while we're at it) so we know if we need to iconv.
+         * We usually can avoid the iconv calls.
+         */
+        mbslen = 0;
+        p = (uchar_t *)mbs;
+        while (*p != '\0') {
+                mbslen++;
+                if (*p & 0x80)
+                        need_iconv = B_TRUE;
+                p++;
                 }
 
-                if (wc & 0xFF00) {
-                        /*LINTED E_BAD_PTR_CAST_ALIGN*/
-                        *((smb_wchar_t *)string) = wc;
-                        string += sizeof (smb_wchar_t);
-                }
-                else
-                {
-                        *string = (unsigned char)wc;
-                        string++;
-                }
+        if (need_iconv) {
+                int     rc;
+                char    *obuf = (char *)oems;
+                size_t  olen = max_oemlen;
+                size_t  ilen = mbslen;
+#if defined(_KERNEL) || defined(_FAKE_KERNEL)
+                char *ibuf = (char *)mbs;
+                kiconv_t ic;
+                int     err;
 
-                mbstring += len;
+                ic = kiconv_open(smb_oem_codepage, "UTF-8");
+                if (ic == (kiconv_t)-1)
+                        goto just_copy;
+                rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err);
+                (void) kiconv_close(ic);
+#else   /* _KERNEL || _FAKE_KERNEL */
+                const char *ibuf = mbs;
+                iconv_t ic;
+                ic = iconv_open(smb_oem_codepage, "UTF-8");
+                if (ic == (iconv_t)-1)
+                        goto just_copy;
+                rc = iconv(ic, &ibuf, &ilen, &obuf, &olen);
+                (void) iconv_close(ic);
+#endif  /* _KERNEL || _FAKE_KERNEL */
+                if (rc < 0)
+                        return (-1);
+                /* Return val. is output bytes. */
+                rlen = (max_oemlen - olen);
+        } else {
+        just_copy:
+                rlen = mbslen;
+                if (rlen > max_oemlen)
+                        rlen = max_oemlen;
+                bcopy(mbs, oems, rlen);
         }
+        if (rlen < max_oemlen)
+                oems[rlen] = '\0';
 
-        *string = 0;
-
-        /*LINTED E_PTRDIFF_OVERFLOW*/
-        return ((unsigned char *)string - start);
+        return (rlen);
 }