Print this page
NEX-19712 SMB directory listings sometimes wrong after NEX-19025
Reviewed by: Matt Barden <matt.barden@nexenta.com>
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
NEX-19025 CIFS gets confused with filenames containing enhanced Unicode
Reviewed by: Matt Barden <matt.barden@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
and: (fix build, check-rtime)
NEX-4458 Incorrect directory listing response for non-UNICODE clients
Reviewed by: Matt Barden <Matt.Barden@nexenta.com>
Reviewed by: Kevin Crowe <kevin.crowe@nexenta.com>
NEX-2460 libfksmbd should not link with libsmb
SMB-50 User-mode SMB server
Includes work by these authors:
Thomas Keiser <thomas.keiser@nexenta.com>
Albert Lee <trisk@nexenta.com>
@@ -20,47 +20,29 @@
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
- * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
*/
/*
- * Multibyte/wide-char conversion routines. Wide-char encoding provides
- * a fixed size character encoding that maps to the Unicode 16-bit
- * (UCS-2) character set standard. Multibyte or UCS transformation
- * format (UTF) encoding is a variable length character encoding scheme
- * that s compatible with existing ASCII characters and guarantees that
- * the resultant strings do not contain embedded null characters. Both
- * types of encoding provide a null terminator: single byte for UTF-8
- * and a wide-char null for Unicode. See RFC 2044.
- *
- * The table below illustrates the UTF-8 encoding scheme. The letter x
- * indicates bits available for encoding the character value.
- *
- * UCS-2 UTF-8 octet sequence (binary)
- * 0x0000-0x007F 0xxxxxxx
- * 0x0080-0x07FF 110xxxxx 10xxxxxx
- * 0x0800-0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
- *
- * RFC 2044
- * UTF-8,a transformation format of UNICODE and ISO 10646
- * F. Yergeau
- * Alis Technologies
- * October 1996
+ * Multibyte/wide-char conversion routines. SMB uses UTF-16 on the wire
+ * (smb_wchar_t) and we use UTF-8 internally (our multi-byte, or mbs).
*/
#if defined(_KERNEL) || defined(_FAKE_KERNEL)
#include <sys/types.h>
#include <sys/sunddi.h>
-#else
+#else /* _KERNEL || _FAKE_KERNEL */
#include <stdio.h>
#include <stdlib.h>
-#include <assert.h>
#include <strings.h>
-#endif
+#include <iconv.h>
+#include <assert.h>
+#endif /* _KERNEL || _FAKE_KERNEL */
+#include <sys/u8_textprep.h>
#include <smbsrv/string.h>
/*
* mbstowcs
@@ -73,30 +55,41 @@
* Returns the number of wide characters converted, not counting
* any terminating null wide character. Returns -1 if an invalid
* multibyte character is encountered.
*/
size_t
-smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars)
+smb_mbstowcs(smb_wchar_t *wcs, const char *mbs, size_t nwchars)
{
- int len;
- smb_wchar_t *start = wcstring;
+ size_t mbslen, wcslen;
+ int err;
- while (nwchars--) {
- len = smb_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX);
- if (len < 0) {
- *wcstring = 0;
+ /* NULL or empty input is allowed. */
+ if (mbs == NULL || *mbs == '\0') {
+ if (wcs != NULL && nwchars > 0)
+ *wcs = 0;
+ return (0);
+ }
+
+ /*
+ * Traditional mbstowcs(3C) allows wcs==NULL to get the length.
+ * SMB never calls it that way, but let's future-proof.
+ */
+ if (wcs == NULL) {
return ((size_t)-1);
}
- if (*mbstring == 0)
- break;
+ mbslen = strlen(mbs);
+ wcslen = nwchars;
+ err = uconv_u8tou16((const uchar_t *)mbs, &mbslen,
+ wcs, &wcslen, UCONV_OUT_LITTLE_ENDIAN);
+ if (err != 0)
+ return ((size_t)-1);
- ++wcstring;
- mbstring += len;
- }
+ if (wcslen < nwchars)
+ wcs[wcslen] = 0;
- return (wcstring - start);
+ return (wcslen);
}
/*
* mbtowc
@@ -111,55 +104,42 @@
* without them being interpreted as characters. If mbchar is null
* mbtowc should return non-zero if the current locale requires shift
* states. Otherwise it should return 0.
*
* If mbchar is non-null, returns the number of bytes processed in
- * mbchar. If mbchar is invalid, returns -1.
+ * mbchar. If mbchar points to a null character, convert the null
+ * (*wcharp = 0) but return length zero. If mbchar is invalid, returns -1.
*/
int /*ARGSUSED*/
-smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes)
+smb_mbtowc(uint32_t *wcharp, const char *mbchar, size_t nbytes)
{
- unsigned char mbyte;
- smb_wchar_t wide_char;
- int count;
- int bytes_left;
+ uint32_t wide_char;
+ int count, err;
+ size_t mblen;
+ size_t wclen;
if (mbchar == NULL)
return (0); /* no shift states */
- /* 0xxxxxxx -> 1 byte ASCII encoding */
- if (((mbyte = *mbchar++) & 0x80) == 0) {
- if (wcharp)
- *wcharp = (smb_wchar_t)mbyte;
-
- return (mbyte ? 1 : 0);
- }
-
- /* 10xxxxxx -> invalid first byte */
- if ((mbyte & 0x40) == 0)
+ /*
+ * How many bytes in this symbol?
+ */
+ count = u8_validate((char *)mbchar, nbytes, NULL, 0, &err);
+ if (count < 0)
return (-1);
- wide_char = mbyte;
- if ((mbyte & 0x20) == 0) {
- wide_char &= 0x1f;
- bytes_left = 1;
- } else if ((mbyte & 0x10) == 0) {
- wide_char &= 0x0f;
- bytes_left = 2;
- } else {
+ mblen = count;
+ wclen = 1;
+ err = uconv_u8tou32((const uchar_t *)mbchar, &mblen,
+ &wide_char, &wclen, UCONV_OUT_SYSTEM_ENDIAN);
+ if (err != 0)
return (-1);
+ if (wclen == 0) {
+ wide_char = 0;
+ count = 0;
}
- count = 1;
- while (bytes_left--) {
- if (((mbyte = *mbchar++) & 0xc0) != 0x80)
- return (-1);
-
- count++;
- wide_char = (wide_char << 6) | (mbyte & 0x3f);
- }
-
if (wcharp)
*wcharp = wide_char;
return (count);
}
@@ -171,29 +151,31 @@
* The wctomb() function converts a wide character wchar into a multibyte
* character and stores the result in mbchar. The object pointed to by
* mbchar must be large enough to accommodate the multibyte character.
*
* Returns the number of bytes written to mbchar.
+ * Note: handles null like any 1-byte char.
*/
int
-smb_wctomb(char *mbchar, smb_wchar_t wchar)
+smb_wctomb(char *mbchar, uint32_t wchar)
{
- if ((wchar & ~0x7f) == 0) {
- *mbchar = (char)wchar;
- return (1);
- }
+ char junk[MTS_MB_CUR_MAX+1];
+ size_t mblen;
+ size_t wclen;
+ int err;
- if ((wchar & ~0x7ff) == 0) {
- *mbchar++ = (wchar >> 6) | 0xc0;
- *mbchar = (wchar & 0x3f) | 0x80;
- return (2);
- }
+ if (mbchar == NULL)
+ mbchar = junk;
- *mbchar++ = (wchar >> 12) | 0xe0;
- *mbchar++ = ((wchar >> 6) & 0x3f) | 0x80;
- *mbchar = (wchar & 0x3f) | 0x80;
- return (3);
+ mblen = MTS_MB_CUR_MAX;
+ wclen = 1;
+ err = uconv_u32tou8(&wchar, &wclen, (uchar_t *)mbchar, &mblen,
+ UCONV_IN_SYSTEM_ENDIAN | UCONV_IGNORE_NULL);
+ if (err != 0)
+ return (-1);
+
+ return ((int)mblen);
}
/*
* wcstombs
@@ -203,50 +185,50 @@
* stored in mbstring. Partial multibyte characters at the end of the
* string are not stored. The multibyte character string is null
* terminated if there is room.
*
* Returns the number of bytes converted, not counting the terminating
- * null byte.
+ * null byte. Returns -1 if an invalid WC sequence is encountered.
*/
size_t
-smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes)
+smb_wcstombs(char *mbs, const smb_wchar_t *wcs, size_t nbytes)
{
- char *start = mbstring;
- const smb_wchar_t *wcp = wcstring;
- smb_wchar_t wide_char = 0;
- char buf[4];
- size_t len;
+ size_t mbslen, wcslen;
+ int err;
- if ((mbstring == NULL) || (wcstring == NULL))
+ /* NULL or empty input is allowed. */
+ if (wcs == NULL || *wcs == 0) {
+ if (mbs != NULL && nbytes > 0)
+ *mbs = '\0';
return (0);
-
- while (nbytes > MTS_MB_CHAR_MAX) {
- wide_char = *wcp++;
- len = smb_wctomb(mbstring, wide_char);
-
- if (wide_char == 0)
- /*LINTED E_PTRDIFF_OVERFLOW*/
- return (mbstring - start);
-
- mbstring += len;
- nbytes -= len;
}
- while (wide_char && nbytes) {
- wide_char = *wcp++;
- if ((len = smb_wctomb(buf, wide_char)) > nbytes) {
- *mbstring = 0;
- break;
+ /*
+ * Traditional wcstombs(3C) allows mbs==NULL to get the length.
+ * SMB never calls it that way, but let's future-proof.
+ */
+ if (mbs == NULL) {
+ return ((size_t)-1);
}
- bcopy(buf, mbstring, len);
- mbstring += len;
- nbytes -= len;
- }
+ /*
+ * Compute wcslen
+ */
+ wcslen = 0;
+ while (wcs[wcslen] != 0)
+ wcslen++;
- /*LINTED E_PTRDIFF_OVERFLOW*/
- return (mbstring - start);
+ mbslen = nbytes;
+ err = uconv_u16tou8(wcs, &wcslen,
+ (uchar_t *)mbs, &mbslen, UCONV_IN_LITTLE_ENDIAN);
+ if (err != 0)
+ return ((size_t)-1);
+
+ if (mbslen < nbytes)
+ mbs[mbslen] = '\0';
+
+ return (mbslen);
}
/*
* Returns the number of bytes that would be written if the multi-
@@ -254,153 +236,241 @@
* counting the terminating null wide character.
*/
size_t
smb_wcequiv_strlen(const char *mbs)
{
- smb_wchar_t wide_char;
+ uint32_t wide_char;
size_t bytes;
size_t len = 0;
while (*mbs) {
bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
if (bytes == ((size_t)-1))
return ((size_t)-1);
+ mbs += bytes;
len += sizeof (smb_wchar_t);
- mbs += bytes;
+ if (bytes > 3) {
+ /*
+ * Extended unicode, so TWO smb_wchar_t
+ */
+ len += sizeof (smb_wchar_t);
}
+ }
return (len);
}
/*
* Returns the number of bytes that would be written if the multi-
- * byte string mbs was converted to a single byte character string,
- * not counting the terminating null character.
+ * byte string mbs was converted to an OEM character string,
+ * (smb_mbstooem) not counting the terminating null character.
*/
size_t
smb_sbequiv_strlen(const char *mbs)
{
- smb_wchar_t wide_char;
size_t nbytes;
size_t len = 0;
while (*mbs) {
- nbytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
+ nbytes = smb_mbtowc(NULL, mbs, MTS_MB_CHAR_MAX);
if (nbytes == ((size_t)-1))
return ((size_t)-1);
+ if (nbytes == 0)
+ break;
- if (wide_char & 0xFF00)
- len += sizeof (smb_wchar_t);
- else
- ++len;
+ if (nbytes == 1) {
+ /* ASCII */
+ len++;
+ } else if (nbytes < 8) {
+ /* Compute OEM length */
+ char mbsbuf[8];
+ uint8_t oembuf[8];
+ int oemlen;
+ (void) strlcpy(mbsbuf, mbs, nbytes+1);
+ oemlen = smb_mbstooem(oembuf, mbsbuf, 8);
+ if (oemlen < 0)
+ return ((size_t)-1);
+ len += oemlen;
+ } else {
+ return ((size_t)-1);
+ }
mbs += nbytes;
}
return (len);
}
+/*
+ * Convert OEM strings to/from internal (UTF-8) form.
+ *
+ * We rarely encounter these anymore because all modern
+ * SMB clients use Unicode (UTF-16). The few cases where
+ * this IS still called are normally using ASCII, i.e.
+ * tag names etc. so short-cut those cases. If we get
+ * something non-ASCII we have to call iconv.
+ *
+ * If we were to really support OEM code pages, we would
+ * need to have a way to set the OEM code page from some
+ * configuration value. For now it's always CP850.
+ * See also ./smb_oem.c
+ */
+static char smb_oem_codepage[32] = "CP850";
/*
* oemtombs
*
- * Convert a regular null terminated string 'string' to a UTF-8 encoded
- * null terminated multi-byte string 'mbstring'. Only full converted
- * UTF-8 characters will be written 'mbstring'. If a character will not
- * fit within the remaining buffer space or 'mbstring' will overflow
- * max_mblen, the conversion process will be terminated and 'mbstring'
- * will be null terminated.
+ * Convert a null terminated OEM string 'string' to a UTF-8 string
+ * no longer than max_mblen (null terminated if space).
*
- * Returns the number of bytes written to 'mbstring', excluding the
- * terminating null character.
+ * If the input string contains invalid OEM characters, a value
+ * of -1 will be returned. Otherwise returns the length of 'mbs',
+ * excluding the terminating null character.
*
* If either mbs or oems is a null pointer, -1 is returned.
*/
int
-smb_stombs(char *mbstring, char *string, int max_mblen)
+smb_oemtombs(char *mbs, const uint8_t *oems, int max_mblen)
{
- char *start = mbstring;
- unsigned char *p = (unsigned char *)string;
- int space_left = max_mblen;
- int len;
- smb_wchar_t wide_char;
- char buf[4];
+ uchar_t *p;
+ int oemlen;
+ int rlen;
+ boolean_t need_iconv = B_FALSE;
- if (!mbstring || !string)
+ if (mbs == NULL || oems == NULL)
return (-1);
- while (*p && space_left > 2) {
- wide_char = *p++;
- len = smb_wctomb(mbstring, wide_char);
- mbstring += len;
- space_left -= len;
+ /*
+ * Check if the oems is all ASCII (and get the length
+ * while we're at it) so we know if we need to iconv.
+ * We usually can avoid the iconv calls.
+ */
+ oemlen = 0;
+ p = (uchar_t *)oems;
+ while (*p != '\0') {
+ oemlen++;
+ if (*p & 0x80)
+ need_iconv = B_TRUE;
+ p++;
}
- if (*p) {
- wide_char = *p;
- if ((len = smb_wctomb(buf, wide_char)) < 2) {
- *mbstring = *buf;
- mbstring += len;
- space_left -= len;
+ if (need_iconv) {
+ int rc;
+ char *obuf = mbs;
+ size_t olen = max_mblen;
+ size_t ilen = oemlen;
+#if defined(_KERNEL) || defined(_FAKE_KERNEL)
+ char *ibuf = (char *)oems;
+ kiconv_t ic;
+ int err;
+
+ ic = kiconv_open("UTF-8", smb_oem_codepage);
+ if (ic == (kiconv_t)-1)
+ goto just_copy;
+ rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err);
+ (void) kiconv_close(ic);
+#else /* _KERNEL || _FAKE_KERNEL */
+ const char *ibuf = (char *)oems;
+ iconv_t ic;
+ ic = iconv_open("UTF-8", smb_oem_codepage);
+ if (ic == (iconv_t)-1)
+ goto just_copy;
+ rc = iconv(ic, &ibuf, &ilen, &obuf, &olen);
+ (void) iconv_close(ic);
+#endif /* _KERNEL || _FAKE_KERNEL */
+ if (rc < 0)
+ return (-1);
+ /* Return val. is output bytes. */
+ rlen = (max_mblen - olen);
+ } else {
+ just_copy:
+ rlen = oemlen;
+ if (rlen > max_mblen)
+ rlen = max_mblen;
+ bcopy(oems, mbs, rlen);
}
- }
+ if (rlen < max_mblen)
+ mbs[rlen] = '\0';
- *mbstring = '\0';
-
- /*LINTED E_PTRDIFF_OVERFLOW*/
- return (mbstring - start);
+ return (rlen);
}
/*
* mbstooem
*
- * Convert a null terminated multi-byte string 'mbstring' to a regular
- * null terminated string 'string'. A 1-byte character in 'mbstring'
- * maps to a 1-byte character in 'string'. A 2-byte character in
- * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null.
- * Otherwise the upper byte null will be discarded to ensure that the
- * output stream does not contain embedded null characters.
+ * Convert a null terminated multi-byte string 'mbs' to an OEM string
+ * no longer than max_oemlen (null terminated if space).
*
- * If the input stream contains invalid multi-byte characters, a value
- * of -1 will be returned. Otherwise the length of 'string', excluding
- * the terminating null character, is returned.
+ * If the input string contains invalid multi-byte characters, a value
+ * of -1 will be returned. Otherwise returns the length of 'oems',
+ * excluding the terminating null character.
*
* If either oems or mbs is a null pointer, -1 is returned.
*/
int
-smb_mbstos(char *string, const char *mbstring)
+smb_mbstooem(uint8_t *oems, const char *mbs, int max_oemlen)
{
- smb_wchar_t wc;
- unsigned char *start = (unsigned char *)string;
- int len;
+ uchar_t *p;
+ int mbslen;
+ int rlen;
+ boolean_t need_iconv = B_FALSE;
- if (string == NULL || mbstring == NULL)
+ if (oems == NULL || mbs == NULL)
return (-1);
- while (*mbstring) {
- if ((len = smb_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
- *string = 0;
- return (-1);
+ /*
+ * Check if the mbs is all ASCII (and get the length
+ * while we're at it) so we know if we need to iconv.
+ * We usually can avoid the iconv calls.
+ */
+ mbslen = 0;
+ p = (uchar_t *)mbs;
+ while (*p != '\0') {
+ mbslen++;
+ if (*p & 0x80)
+ need_iconv = B_TRUE;
+ p++;
}
- if (wc & 0xFF00) {
- /*LINTED E_BAD_PTR_CAST_ALIGN*/
- *((smb_wchar_t *)string) = wc;
- string += sizeof (smb_wchar_t);
- }
- else
- {
- *string = (unsigned char)wc;
- string++;
- }
+ if (need_iconv) {
+ int rc;
+ char *obuf = (char *)oems;
+ size_t olen = max_oemlen;
+ size_t ilen = mbslen;
+#if defined(_KERNEL) || defined(_FAKE_KERNEL)
+ char *ibuf = (char *)mbs;
+ kiconv_t ic;
+ int err;
- mbstring += len;
+ ic = kiconv_open(smb_oem_codepage, "UTF-8");
+ if (ic == (kiconv_t)-1)
+ goto just_copy;
+ rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err);
+ (void) kiconv_close(ic);
+#else /* _KERNEL || _FAKE_KERNEL */
+ const char *ibuf = mbs;
+ iconv_t ic;
+ ic = iconv_open(smb_oem_codepage, "UTF-8");
+ if (ic == (iconv_t)-1)
+ goto just_copy;
+ rc = iconv(ic, &ibuf, &ilen, &obuf, &olen);
+ (void) iconv_close(ic);
+#endif /* _KERNEL || _FAKE_KERNEL */
+ if (rc < 0)
+ return (-1);
+ /* Return val. is output bytes. */
+ rlen = (max_oemlen - olen);
+ } else {
+ just_copy:
+ rlen = mbslen;
+ if (rlen > max_oemlen)
+ rlen = max_oemlen;
+ bcopy(mbs, oems, rlen);
}
+ if (rlen < max_oemlen)
+ oems[rlen] = '\0';
- *string = 0;
-
- /*LINTED E_PTRDIFF_OVERFLOW*/
- return ((unsigned char *)string - start);
+ return (rlen);
}