Velocity Reviews - Computer Hardware Reviews

Velocity Reviews > Newsgroups > Programming > C Programming > wcstombs() problem

Reply
Thread Tools

wcstombs() problem

 
 
arnuld
Guest
Posts: n/a
 
      02-23-2012
AIM: To convert a wide-character string into a character string
PROBLEM: (1) checking return value or errno.
(2) conversion just does not happen.




#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <errno.h>
#include <string.h>
#include <locale.h>
#include <limits.h>

/* NOTE(review): __STDC_ISO_10646__ is a macro the implementation defines
   conditionally. Defining it here does not change how wchar_t values are
   encoded, so it does not give the program ISO 10646 semantics. */
#ifndef __STDC_ISO_10646__
#define __STDC_ISO_10646__
#endif

/* Status codes used by WStr2CStr() and the buffer capacity used by main(). */
enum {
/* Not referenced by the code shown in this listing. */
VAL_SUCC = 0,
/* Generic failure: NULL argument or zero bytes converted. */
VAL_ERR = -1,
/* Chosen when errno is EILSEQ after wcstombs(). */
ERR_ENC = -101,
/* Chosen when errno holds some other non-zero value. */
ERR_ERRNO_UNKNOWN = -102,
/* Number of wide characters main() asks the file reader for. */
SIZE_INPUT = 1000
};


/* Set the LC_CTYPE locale to t; exits the program when that fails. */
void setLocale(const char* t);
/* Convert wide string ws into the buffer *s using wcstombs() with limit len. */
size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len);
/* Read one line of "german.txt" into arr using fgetws() with limit len. */
void getGermanLanguageFromFile(wchar_t arr[], const size_t len);
/* malloc() len bytes; exits the program when the allocation fails. */
char* myMalloc(const size_t len);


/* Reads one line of wide text from german.txt, converts it to a multibyte
   string in a heap buffer via WStr2CStr(), and prints both strings and
   their lengths. Exits with EXIT_FAILURE when the conversion result is
   not positive. */
int main(void)
{
int ret = 0;
wchar_t contents[SIZE_INPUT+1] = {0};
char* p = NULL;
size_t plen = 0;

setLocale("en_US.utf8");
getGermanLanguageFromFile(contents, SIZE_INPUT);
printf("Contents = {%ls}\n\n", contents);


/* NOTE(review): ret is still 0 at this point, so plen becomes 1 and the
   buffer below has room for a single byte. The number of bytes actually
   needed is what wcstombs(NULL, contents, 0) returns, plus one for the
   terminating null; wcslen(contents) + 1 is too small whenever a
   character needs more than one byte. */
plen = ret + 1;
/* NOTE(review): plen is a size_t; the matching conversion is %zu. */
printf("plen = %d\n", plen);
p = myMalloc(plen);
/* NOTE(review): WStr2CStr() returns size_t; the result is narrowed to
   int here so that the error codes show up as negative values. */
ret = WStr2CStr(&p, contents, plen);

if( ret <= 0)
{
printf("IN: %s @%d ERROR converting to characters: ERRNO = %d\n",
__FILE__, __LINE__, ret);
exit(EXIT_FAILURE);
}

/* NOTE(review): wcstombs() stores no terminating null when it fills all
   plen bytes, so p is not guaranteed to be a terminated string here. */
printf("p = [%s]\n\n", p);
printf("W = %zu, Char = %zu\n", wcslen(contents), strlen(p));

free(p);


return 0;
}

/* Allocate len bytes with malloc().
   Prints a diagnostic and terminates the program with EXIT_FAILURE when
   malloc() returns NULL, so callers never receive a NULL pointer. */
char* myMalloc(const size_t len)
{
    char* buf = malloc(len * (sizeof *buf));

    if (buf != NULL)
    {
        return buf;
    }

    printf("IN: %s @%d Out of Memory\n", __FILE__, __LINE__);
    exit(EXIT_FAILURE);
}


/* Select locale t for the LC_CTYPE category.
   Reports the outcome on stdout; when setlocale() rejects the name the
   program is terminated with EXIT_FAILURE. */
void setLocale(const char* t)
{
    const char* applied = setlocale(LC_CTYPE, t);

    if (applied == NULL)
    {
        printf("IN: %s @%d ERROR: can not set locale [%s]\n", __FILE__, __LINE__, t);
        exit(EXIT_FAILURE);
    }

    printf("IN: %s @%d Locale Set = [%s]\n", __FILE__, __LINE__, t);
}

/*
 * Read the first line of "german.txt" into arr as wide characters.
 *
 * Contents of german.txt:
 *   Megaupload-Gründer Schmitz gegen Kaution auf freiem Fuß. News
 *
 * arr - destination buffer
 * len - capacity of arr in wide characters, terminating null included
 *
 * When the file cannot be opened a diagnostic is printed and arr is left
 * untouched. When nothing can be read (empty file, read error or
 * decoding error) a diagnostic is printed and the program exits with
 * EXIT_FAILURE. The file is closed on every path.
 */
void getGermanLanguageFromFile(wchar_t arr[], const size_t len)
{
    const char* filename = "german.txt";
    FILE* fp = fopen(filename, "r");

    if (NULL == fp)
    {
        printf("IN:%s @%d: ERROR [%s]Can not open (%s)\n", __FILE__,
               __LINE__, strerror(errno), filename);
        return;
    }

    errno = 0;
    /* fgetws() takes an int count: clamp so a huge len cannot turn negative. */
    if (NULL == fgetws(arr, (len > (size_t)INT_MAX) ? INT_MAX : (int)len, fp))
    {
        /* Keep errno before feof()/printf()/fclose() get a chance to change it. */
        const int saved = errno;

        if (feof(fp))
        {
            /* Plain end of file is not an errno condition. */
            printf("IN: %s @%d Reading Error: end of file, nothing read\n",
                   __FILE__, __LINE__);
        }
        else
        {
            printf("IN: %s @%d Reading Error: [%s] ERRNO = %d\n", __FILE__,
                   __LINE__, strerror(saved), saved);
        }
        fclose(fp);
        exit(EXIT_FAILURE);
    }

    fclose(fp);
}




/*
 * Convert the wide string ws to a multibyte string stored in *s.
 *
 * s   - address of the destination buffer pointer
 * ws  - null-terminated wide string to convert
 * len - maximum number of bytes wcstombs() may store in *s
 *
 * Returns the number of bytes stored, not counting a terminating null
 * byte. When the return value equals len the buffer was filled
 * completely and no terminating null was written.
 * On failure one of the negative status codes is returned, converted to
 * size_t:
 *   VAL_ERR            s or ws is NULL, or zero bytes were converted
 *   ERR_ENC            wcstombs() failed with errno == EILSEQ
 *   ERR_ERRNO_UNKNOWN  wcstombs() failed for any other reason
 */
size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len)
{
    size_t r;

    if (NULL == ws || NULL == s)
    {
        return (size_t)VAL_ERR;
    }

    errno = 0;
    r = wcstombs(*s, ws, len);

    /* size_t is unsigned: wcstombs() reports failure as (size_t)-1,
       never as a value below zero. */
    if ((size_t)-1 == r)
    {
        return (size_t)((EILSEQ == errno) ? ERR_ENC : ERR_ERRNO_UNKNOWN);
    }

    if (0 == r)
    {
        printf("IN: %s @%d ERROR: ZERO bytes converted = %zu\n",
               __FILE__, __LINE__, r);
        return (size_t)VAL_ERR;
    }

    printf("IN: %s @%d bytes converted = %zu\n", __FILE__, __LINE__, r);
    return r;
}

==================== OUTPUT ==============================
[arnuld@dune C]$ gcc -std=c99 -pedantic -Wall -Wextra convert.c
[arnuld@dune C]$ ./a.out
IN: convert.c @82 Locale Set = [en_US.utf8]
Contents = {Megaupload-Gründer Schmitz gegen Kaution auf freiem Fuß. News}

plen = 1
IN: convert.c @151 bytes converted = 1
p = [M]

W = 61, Char = 1
[arnuld@dune C]$





I searched archives and came across this piece of code where poster calls
wcstombs() 2 times, first to calculate characters (using NULL argument)
and then to really do the conversion. I wonder if that is the way wcstombs
() was supposed to use (because it works while mine does not):

size_t n = wcstombs(NULL, src, 0);
char *dst = malloc(n + 1);
if(dst == NULL)
{
fprintf(stderr, "memory allocation failed\n");
return NULL;
}
if(wcstombs(dst, src, n + 1) != n)
{
fprintf(stderr, "conversion failed\n");
free(dst);
return NULL;
}



--
arnuld
http://LispMachine.Wordpress.com
 
Reply With Quote
 
 
 
 
arnuld
Guest
Posts: n/a
 
      02-23-2012
> On Thu, 23 Feb 2012 05:35:41 +0000, arnuld wrote:

> ... SNIP..
> int main(void)
> {
> int ret = 0;
> wchar_t contents[SIZE_INPUT+1] = {0}; char* p = NULL;
> size_t plen = 0;
>
> setLocale("en_US.utf8");
> getGermanLanguageFromFile(contents, SIZE_INPUT); printf("Contents =
> {%ls}\n\n", contents);
>
>
> plen = ret + 1;


was stupid enough to do that, changing it to

plen = wcslen(contents) + 1;

does the conversion but still it misses some last characters, any idea
why ?



--
arnuld
http://LispMachine.Wordpress.com
 
Reply With Quote
 
 
 
 
Keith Thompson
Guest
Posts: n/a
 
      02-23-2012
arnuld <(E-Mail Removed)> writes:
[...]
> #ifndef __STDC_ISO_10646__
> #define __STDC_ISO_10646__
> #endif


__STDC_ISO_10646__ is conditionally defined by the implementation.
Defining it yourself won't give you the desired semantics.

--
Keith Thompson (The_Other_Keith) http://www.velocityreviews.com/forums/(E-Mail Removed) <http://www.ghoti.net/~kst>
Will write code for food.
"We must do something. This is something. Therefore, we must do this."
-- Antony Jay and Jonathan Lynn, "Yes Minister"
 
Reply With Quote
 
Barry Schwarz
Guest
Posts: n/a
 
      02-23-2012
On 23 Feb 2012 05:35:41 GMT, arnuld <(E-Mail Removed)> wrote:

>AIM: To convert a wide-character string into a character string


Why do you think this is possible?

>PROBLEM: (1) checking return value or errno.
> (2) conversion just does not happen.
>
>
>
>
>#include <stdio.h>
>#include <stdlib.h>
>#include <wchar.h>
>#include <errno.h>
>#include <string.h>
>#include <locale.h>
>#include <limits.h>
>
>#ifndef __STDC_ISO_10646__
>#define __STDC_ISO_10646__
>#endif
>
>enum {
> VAL_SUCC = 0,
> VAL_ERR = -1,
> ERR_ENC = -101,
> ERR_ERRNO_UNKNOWN = -102,
> SIZE_INPUT = 1000
>};
>
>
>void setLocale(const char* t);
>size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len);
>void getGermanLanguageFromFile(wchar_t arr[], const size_t len);
>char* myMalloc(const size_t len);
>
>
>int main(void)
>{
> int ret = 0;
> wchar_t contents[SIZE_INPUT+1] = {0};
> char* p = NULL;
> size_t plen = 0;
>
> setLocale("en_US.utf8");
> getGermanLanguageFromFile(contents, SIZE_INPUT);
> printf("Contents = {%ls}\n\n", contents);
>
>
> plen = ret + 1;


Fixed in follow-on message to
plen = wcslen(contents)+1;

> printf("plen = %d\n", plen);
> p = myMalloc(plen);
> ret = WStr2CStr(&p, contents, plen);


WStr2CStr returns a size_t which is unsigned.

>
> if( ret <= 0)


Therefore, ret can never be negative.

> {
> printf("IN: %s @%d ERROR converting to characters: ERRNO = %d\n",
>__FILE__, __LINE__, ret);
> exit(EXIT_FAILURE);
> }
>
> printf("p = [%s]\n\n", p);
> printf("W = %zu, Char = %zu\n", wcslen(contents), strlen(p));
>
> free(p);
>
>
> return 0;
>}
>
>char* myMalloc(const size_t len)
>{
> char* p = malloc(len * (sizeof *p));
> if(NULL == p)
> {
> printf("IN: %s @%d Out of Memory\n", __FILE__, __LINE__);
> exit(EXIT_FAILURE);
> }
>
> return p;
>}
>
>
>void setLocale(const char* t)
>{
> if(NULL == setlocale(LC_CTYPE, t))
> {
> printf("IN: %s @%d ERROR: can not set locale [%s]\n", __FILE__,
>__LINE__, t);
> exit(EXIT_FAILURE);
> }
> else
> {
> printf("IN: %s @%d Locale Set = [%s]\n", __FILE__, __LINE__, t);
> }
>}
>
>/* Contents of german.txt: Megaupload-Gründer Schmitz gegen Kaution auf
>freiem Fuß. News */
>void getGermanLanguageFromFile(wchar_t arr[], const size_t len)
>{
> const char* filename = "german.txt";
> FILE* fp;
> wchar_t* retp;
>
> fp = fopen(filename,"r");
> if(NULL == fp)
> {
> printf("IN:%s @%d: ERROR [%s]Can not open (%s)\n",__FILE__,
>__LINE__, strerror(errno), filename);
> return;
> }
>
> errno = 0;
> retp = fgetws(arr, len, fp);
>
> if(NULL == retp)
> {
> if(feof(fp))
> {
> printf("IN: %s @%d Reading Error: [%s]\n", __FILE__, __LINE__,
>strerror(errno));
> }
> else
> {
> printf("IN: %s @%d Reading Error: ERRNO = %d\n", __FILE__,
>__LINE__, errno);
> }
> exit(EXIT_FAILURE);
> }
>}
>
>
>
>
>size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len)
>{
> int ret = VAL_ERR;
>
> if(NULL == ws || NULL == s)
> {
> ret = VAL_ERR;
> }
> else
> {
> size_t r;
> errno = 0;
> r = wcstombs(*s, ws, len);


wcstombs does not convert wide to char. It converts wide to
multi-byte. Multi-byte characters can occupy one or TWO bytes. At
least two of the wide characters you read in from the file (ü and ß)
appear to require two bytes. Consequently, at least two of the last
characters from the original message will not fit in the first len
characters pointed to by *s.

> if(0 >= r)


r is a size_t and therefore can never be negative.

> {
> if(EILSEQ == errno)
> {
> ret = ERR_ENC;
> }
> else if(0 == r)
> {
> printf("IN: %s @%d ERROR: ZERO bytes converted = %zu\n",
>__FILE__, __LINE__, r);
> ret = VAL_ERR;
> }
> else if(errno)
> {
> ret = ERR_ERRNO_UNKNOWN;
> }
> }
> else
> {
> printf("IN: %s @%d bytes converted = %zu\n", __FILE__,
>__LINE__, r);
> ret = r;
> }
> }
>
> return ret;
>}
>
>==================== OUTPUT ==============================
>[arnuld@dune C]$ gcc -std=c99 -pedantic -Wall -Wextra convert.c
>[arnuld@dune C]$ ./a.out
>IN: convert.c @82 Locale Set = [en_US.utf8]
>Contents = {Megaupload-Gründer Schmitz gegen Kaution auf freiem Fuß. News}
>
>plen = 1
>IN: convert.c @151 bytes converted = 1
>p = [M]
>
>W = 61, Char = 1
>[arnuld@dune C]$
>
>
>
>
>
>I searched archives and came across this piece of code where poster calls
>wcstombs() 2 times, first to calculate characters (using NULL argument)
>and then to really do the conversion. I wonder if that is the way wcstombs
>() was supposed to use (because it works while mine does not):
>
> size_t n = wcstombs(NULL, src, 0);
> char *dst = malloc(n + 1);
> if(dst == NULL)
> {
> fprintf(stderr, "memory allocation failed\n");
> return NULL;
> }
> if(wcstombs(dst, src, n + 1) != n)
> {
> fprintf(stderr, "conversion failed\n");
> free(dst);
> return NULL;
> }


--
Remove del for email
 
Reply With Quote
 
Ben Bacarisse
Guest
Posts: n/a
 
      02-23-2012
arnuld <(E-Mail Removed)> writes:

>> On Thu, 23 Feb 2012 05:35:41 +0000, arnuld wrote:

>
>> ... SNIP..
>> int main(void)
>> {
>> int ret = 0;
>> wchar_t contents[SIZE_INPUT+1] = {0}; char* p = NULL;
>> size_t plen = 0;
>>
>> setLocale("en_US.utf8");
>> getGermanLanguageFromFile(contents, SIZE_INPUT); printf("Contents =
>> {%ls}\n\n", contents);
>>
>>
>> plen = ret + 1;

>
> was stupid enough to do that, changing it to
>
> plen = wcslen(contents) + 1;
>
> does the conversion but still it misses some last characters, any idea
> why ?


Barry Schwarz has answered this, as have you! The "mbs" at the end of
wcstombs stands for "multi-byte string". That means that some
characters need more than one byte to be encoded so the buffer size
needed is rarely wcslen(contents) + 1.

In your original post, you said that you'd seen code that calls wcstombs
twice -- once to get the length and again to do the conversion and you
asked "I wonder if that is the way wcstombs () was supposed to use
(because it works while mine does not)". The answer is "yes".

--
Ben.
 
Reply With Quote
 
arnuld
Guest
Posts: n/a
 
      02-24-2012
> On Thu, 23 Feb 2012 12:34:29 +0000, Ben Bacarisse wrote:

> Barry Schwarz has answered this, as have you! The "mbs" at the end of
> wcstombs stands for "multi-byte string". That means that some
> characters need more than once byte to be encoded so the buffer size
> needed is rarely wcslen(contents) + 1.


I worked for 2 days to understand this. I wrote several examples, read
several examples, and read the archives. Since H&S5 does not mention this,
I assumed it must be easy to understand. I feel like I am 'still a kid in C
programming'.


> In your original post, you said that you'd seen code that calls wcstombs
> twice -- once to get the length and again to do the conversion and you
> asked "I wonder if that is the way wcstombs () was supposed to use
> (because it works while mine does not)". The answer is "yes".


What about mblen() to calculate length as alternative ? I tried it with
this code but it does not work:

#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <errno.h>
#include <string.h>
#include <locale.h>

/* Status codes used by WStr2CStr() and buffer sizes used by main(). */
enum {
/* Not referenced by the code shown in this listing. */
VAL_SUCC = 0,
/* Generic failure: NULL argument or zero bytes converted. */
VAL_ERR = -1,
/* Chosen when errno is EILSEQ after wcstombs(). */
ERR_ENC = -101,
/* Chosen when errno holds some other non-zero value. */
ERR_ERRNO_UNKNOWN = -102,
/* Not referenced by the code shown in this listing. */
SIZE_NAME = 10,
/* Capacity (minus one) of the wide and narrow buffers in main(). */
SIZE_INPUT = 1000,
/* Not referenced by the code shown in this listing. */
SIZE_READ = 1};


/* Set the LC_CTYPE locale to t; exits the program when that fails. */
void mySetLocale(const char* t);
/* Convert wide string ws into the buffer *s using wcstombs() with limit len.
   Defined in this listing but not called by main(). */
size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len);
/* Read one line of "german.txt" into arr using fgetws() with limit len. */
void get_InternationlText_from_file(wchar_t arr[], const size_t len);
/* NOTE(review): declared here, but no definition appears in this listing. */
char* myMalloc(const size_t len);

/* Reads one line of wide text from german.txt, then prints its length in
   wide characters next to the result of mblen() on a separate char
   buffer. */
int main(void)
{
wchar_t contents[SIZE_INPUT+1] = {0};
/* NOTE(review): arr is zero-initialised and nothing ever writes to it. */
char arr[SIZE_INPUT+1] = {0};
size_t wlen = 0;
int len = 0;

mySetLocale("en_US.utf8");
get_InternationlText_from_file(contents, SIZE_INPUT);
printf("Contents = {%ls}\n\n", contents);

wlen = wcslen(contents);
/* NOTE(review): mblen() reports the byte length of the single multibyte
   character at the start of its argument, not of a whole string, and it
   is applied to arr, which holds only zero bytes; it therefore returns
   0. The text in contents is never converted, so this cannot measure
   it. */
len = mblen(arr, SIZE_INPUT+1);

printf("IN: %s @%d: wlen = %zu, len = %d\n", __FILE__, __LINE__, wlen,
len);

return 0;
}



/* Select locale t for the LC_CTYPE category and report the result on
   stdout. A locale name that setlocale() rejects ends the program with
   EXIT_FAILURE. */
void mySetLocale(const char* t)
{
    const int accepted = (setlocale(LC_CTYPE, t) != NULL);

    if (!accepted)
    {
        printf("IN: %s @%d ERROR: can not set locale [%s]\n", __FILE__, __LINE__, t);
        exit(EXIT_FAILURE);
    }

    printf("IN: %s @%d Locale Set = [%s]\n", __FILE__, __LINE__, t);
}


/*
 * Read the first line of "german.txt" into arr as wide characters.
 *
 * arr - destination buffer
 * len - capacity of arr in wide characters, terminating null included;
 *       it is handed to fgetws() as an int
 *
 * When the file cannot be opened a diagnostic is printed and arr is left
 * untouched. When nothing can be read (empty file, read error or
 * decoding error) a diagnostic is printed and the program exits with
 * EXIT_FAILURE. The file is closed on every path.
 */
void get_InternationlText_from_file(wchar_t arr[], const size_t len)
{
    const char* filename = "german.txt";
    FILE* fp = fopen(filename, "r");
    wchar_t* line;

    if (NULL == fp)
    {
        printf("IN:%s @%d: ERROR [%s]Can not open (%s)\n", __FILE__,
               __LINE__, strerror(errno), filename);
        return;
    }

    errno = 0;
    line = fgetws(arr, (int)len, fp);

    if (NULL != line)
    {
        fclose(fp);
        return;
    }

    {
        /* Keep errno before feof()/printf()/fclose() get a chance to change it. */
        const int saved = errno;

        if (feof(fp))
        {
            /* Plain end of file is not an errno condition. */
            printf("IN: %s @%d Reading Error: end of file, nothing read\n",
                   __FILE__, __LINE__);
        }
        else
        {
            printf("IN: %s @%d Reading Error: [%s] ERRNO = %d\n", __FILE__,
                   __LINE__, strerror(saved), saved);
        }
    }

    fclose(fp);
    exit(EXIT_FAILURE);
}




/*
 * Convert the wide string ws to a multibyte string stored in *s.
 *
 * s   - address of the destination buffer pointer
 * ws  - null-terminated wide string to convert
 * len - maximum number of bytes wcstombs() may store in *s
 *
 * Returns the number of bytes stored, not counting a terminating null
 * byte. When the return value equals len the buffer was filled
 * completely and no terminating null was written.
 * On failure one of the negative status codes is returned, converted to
 * size_t; compare the result against e.g. (size_t)VAL_ERR, not "< 0":
 *   VAL_ERR            s or ws is NULL, or zero bytes were converted
 *   ERR_ENC            wcstombs() failed with errno == EILSEQ
 *   ERR_ERRNO_UNKNOWN  wcstombs() failed for any other reason
 */
size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len)
{
    size_t converted;

    if (NULL == ws || NULL == s)
    {
        return (size_t)VAL_ERR;
    }

    errno = 0;
    converted = wcstombs(*s, ws, len);

    /* wcstombs() signals a conversion error with (size_t)-1; an unsigned
       value can never compare below zero. */
    if ((size_t)-1 == converted)
    {
        if (EILSEQ == errno)
        {
            return (size_t)ERR_ENC;
        }
        return (size_t)ERR_ERRNO_UNKNOWN;
    }

    if (0 == converted)
    {
        printf("IN: %s @%d ERROR bytes converted = %zu\n",
               __FILE__, __LINE__, converted);
        return (size_t)VAL_ERR;
    }

    printf("IN: %s @%d bytes converted = %zu\n", __FILE__, __LINE__,
           converted);
    return converted;
}

==================== OUTPUT ============================
[arnuld@dune C]$ gcc -std=c99 -pedantic -Wall -Wextra mblen.c
[arnuld@dune C]$ ./a.out
IN: mblen.c @53 Locale Set = [en_US.utf8]
Contents = {Megaupload-Gründer Schmitz gegen Kaution auf freiem Fuß. News}

IN: mblen.c @37: wlen = 61, len = 0
[arnuld@dune C]$



--
arnuld
http://LispMachine.Wordpress.com
 
Reply With Quote
 
arnuld
Guest
Posts: n/a
 
      02-24-2012
>> arnuld wrote:
> On Thu, 23 Feb 2012 00:50:26 -0800, Barry Schwarz wrote:


> WStr2CStr returns a size_t which is unsigned.
>
>> if( ret <= 0)

>
> Therefore, ret can never be negative.


I know that, and you have explained it very well. The problem is how I
reconcile it with this statement from section 16.11.2 of H&S5:

"The function returns the number of characters written to s, not
counting the terminating null character(if any). If a conversion error
occurs, the function returns -1 (cast to size_t)"


Or does it return a value equal to "-1 cast to size_t"
(which is 4294967295 on my machine)?





--
arnuld
http://LispMachine.Wordpress.com
 
Reply With Quote
 
Ike Naar
Guest
Posts: n/a
 
      02-24-2012
On 2012-02-24, arnuld <(E-Mail Removed)> wrote:
>> In your original post, you said that you'd seen code that calls wcstombs
>> twice -- once to get the length and again to do the conversion and you
>> asked "I wonder if that is the way wcstombs () was supposed to use
>> (because it works while mine does not)". The answer is "yes".

>
> What about mblen() to calculate length as alternative ? I tried it with
> this code but it does not work:
>
> [snip]
>
> int main(void)
> {
> wchar_t contents[SIZE_INPUT+1] = {0};
> char arr[SIZE_INPUT+1] = {0};
> size_t wlen = 0;
> int len = 0;
>
> mySetLocale("en_US.utf8");
> get_InternationlText_from_file(contents, SIZE_INPUT);
> printf("Contents = {%ls}\n\n", contents);
>
> wlen = wcslen(contents);
> len = mblen(arr, SIZE_INPUT+1);


It seems there is something missing from this code.
As it is written, mblen is applied to arr which contains all zeroes.

> printf("IN: %s @%d: wlen = %zu, len = %d\n", __FILE__, __LINE__, wlen,
> len);
>
> return 0;
> }

 
Reply With Quote
 
Nobody
Guest
Posts: n/a
 
      02-24-2012
On Fri, 24 Feb 2012 05:32:10 +0000, arnuld wrote:

> What about mblen() to calculate length as alternative ?


mblen() requires that you have already converted the string to multi-byte
representation.

The "correct" answer is the one you noted in your original post: call
wcstombs() with NULL as the first argument to calculate the length of the
resulting multi-byte string. That feature was added for this specific
purpose.

 
Reply With Quote
 
Ben Bacarisse
Guest
Posts: n/a
 
      02-24-2012
arnuld <(E-Mail Removed)> writes:

>>> arnuld wrote:

>> On Thu, 23 Feb 2012 00:50:26 -0800, Barry Schwarz wrote:

>
>> WStr2CStr returns a size_t which is unsigned.
>>
>>> if( ret <= 0)

>>
>> Therefore, ret can never be negative.

>
> I know that an dyou have explained it very well. Problem is how do I
> compete with statement from section 16.11.2 from H&S5:
>
> "The function returns the number of characters written to s, not
> counting the terminating null character(if any). If a conversion error
> occurs, the function returns -1 (cast to size_t)"


You compare the return value with -1 cast to size_t:

if (ret == (size_t)-1)

<snip>
--
Ben.
 
Reply With Quote
 
 
 
Reply

Thread Tools

Posting Rules
You may not post new threads
You may not post replies
You may not post attachments
You may not edit your posts

BB code is On
Smilies are On
[IMG] code is On
HTML code is Off
Trackbacks are On
Pingbacks are On
Refbacks are Off


Similar Threads
Thread Thread Starter Forum Replies Last Post
Problem problem problem :( Need Help Mike ASP General 2 05-11-2004 08:36 AM



Advertisments