1 : /**********************************************************************
2 : * $Id: cpl_recode_stub.cpp 17405 2009-07-17 06:13:24Z chaitanya $
3 : *
4 : * Name: cpl_recode.cpp
5 : * Project: CPL - Common Portability Library
6 : * Purpose: Character set recoding and char/wchar_t conversions.
7 : * Author: Frank Warmerdam, warmerdam@pobox.com
8 : *
9 : * The bulk of this code is derived from the utf.c module from FLTK. It
10 : * was originally downloaded from:
11 : * http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
12 : *
13 : **********************************************************************
14 : * Copyright (c) 2008, Frank Warmerdam
15 : * Copyright 2006 by Bill Spitzak and others.
16 : *
17 : * Permission to use, copy, modify, and distribute this software for any
18 : * purpose with or without fee is hereby granted, provided that the above
19 : * copyright notice and this permission notice appear in all copies.
20 : *
21 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
22 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
23 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
24 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
25 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
26 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
27 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
28 : **********************************************************************/
29 :
30 : #include "cpl_string.h"
31 :
32 : CPL_CVSID("$Id: cpl_recode_stub.cpp 17405 2009-07-17 06:13:24Z chaitanya $");
33 :
34 : #define CPL_RECODE_STUB
35 :
36 : #ifdef CPL_RECODE_STUB
37 :
/* Forward declarations of the file-local UTF-8 helpers (derived from
   FLTK's utf.c) so the public CPL entry points can call them before
   their definitions. */

/* Decode one UTF-8 sequence at p, bounded by end; *len receives the
   number of bytes consumed. */
static unsigned utf8decode(const char* p, const char* end, int* len);
/* Convert srclen bytes of UTF-8 into a zero terminated wchar_t array. */
static unsigned utf8towc(const char* src, unsigned srclen,
                         wchar_t* dst, unsigned dstlen);
/* Convert srclen bytes of UTF-8 into 1-byte characters ('?' when a
   character does not fit in one byte). */
static unsigned utf8toa(const char* src, unsigned srclen,
                        char* dst, unsigned dstlen);
/* Used by CPLRecodeFromWChar() to turn a wchar_t array into UTF-8. */
static unsigned utf8fromwc(char* dst, unsigned dstlen,
                           const wchar_t* src, unsigned srclen);
/* Used by CPLRecode() to turn ISO8859-1 bytes into UTF-8. */
static unsigned utf8froma(char* dst, unsigned dstlen,
                          const char* src, unsigned srclen);
/* Used by CPLIsUTF8(); a non-zero result is reported as "is UTF-8". */
static int utf8test(const char* src, unsigned srclen);

/* Helpers that are only compiled when FUTURE_NEEDS is defined. */
#ifdef FUTURE_NEEDS
static const char* utf8fwd(const char* p, const char* start, const char* end);
static const char* utf8back(const char* p, const char* start, const char*end);
static int utf8encode(unsigned ucs, char* buf);
static int utf8bytes(unsigned ucs);
#endif /* def FUTURE_NEEDS */
55 :
56 : /************************************************************************/
57 : /* ==================================================================== */
58 : /* Stub Implementation not depending on iconv() or WIN32 API. */
59 : /* ==================================================================== */
60 : /************************************************************************/
61 :
62 : /************************************************************************/
63 : /* CPLRecode() */
64 : /************************************************************************/
65 :
66 : /**
67 : * Convert a string from a source encoding to a destination encoding.
68 : *
69 : * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
70 : * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
71 : * <ul>
72 : * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in fact)</li>
73 : * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
74 : * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
75 : * </ul>
76 : *
77 : * If an error occurs an error may, or may not be posted with CPLError().
78 : *
79 : * @param pszSource a NUL terminated string.
80 : * @param pszSrcEncoding the source encoding.
81 : * @param pszDstEncoding the destination encoding.
82 : *
83 : * @return a NUL terminated string which should be freed with CPLFree().
84 : *
85 : * @since GDAL 1.6.0
86 : */
87 :
88 0 : char CPL_DLL *CPLRecode( const char *pszSource,
89 : const char *pszSrcEncoding,
90 : const char *pszDstEncoding )
91 :
92 : {
93 : /* -------------------------------------------------------------------- */
94 : /* Handle a few common short cuts. */
95 : /* -------------------------------------------------------------------- */
96 0 : if( strcmp(pszSrcEncoding,pszDstEncoding) == 0 )
97 0 : return CPLStrdup(pszSource);
98 :
99 0 : if( strcmp(pszSrcEncoding,CPL_ENC_ASCII) == 0
100 : && (strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0
101 : || strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0) )
102 0 : return CPLStrdup(pszSource);
103 :
104 : /* -------------------------------------------------------------------- */
105 : /* If the source or destination is current locale(), we change */
106 : /* it to ISO8859-1 since our stub implementation does not */
107 : /* attempt to address locales properly. */
108 : /* -------------------------------------------------------------------- */
109 :
110 0 : if( pszSrcEncoding[0] == '\0' )
111 0 : pszSrcEncoding = CPL_ENC_ISO8859_1;
112 :
113 0 : if( pszDstEncoding[0] == '\0' )
114 0 : pszDstEncoding = CPL_ENC_ISO8859_1;
115 :
116 : /* -------------------------------------------------------------------- */
117 : /* ISO8859 to UTF8 */
118 : /* -------------------------------------------------------------------- */
119 0 : if( strcmp(pszSrcEncoding,CPL_ENC_ISO8859_1) == 0
120 : && strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
121 : {
122 0 : int nCharCount = strlen(pszSource);
123 0 : char *pszResult = (char *) CPLCalloc(1,nCharCount*2+1);
124 :
125 0 : utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
126 :
127 0 : return pszResult;
128 : }
129 :
130 : /* -------------------------------------------------------------------- */
131 : /* UTF8 to ISO8859 */
132 : /* -------------------------------------------------------------------- */
133 0 : if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0
134 : && strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0 )
135 : {
136 0 : int nCharCount = strlen(pszSource);
137 0 : char *pszResult = (char *) CPLCalloc(1,nCharCount+1);
138 :
139 0 : utf8toa( pszSource, nCharCount, pszResult, nCharCount+1 );
140 :
141 0 : return pszResult;
142 : }
143 :
144 : /* -------------------------------------------------------------------- */
145 : /* Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with */
146 : /* a one-time warning. */
147 : /* -------------------------------------------------------------------- */
148 0 : if( strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
149 : {
150 0 : int nCharCount = strlen(pszSource);
151 0 : char *pszResult = (char *) CPLCalloc(1,nCharCount*2+1);
152 : static int bHaveWarned = FALSE;
153 :
154 0 : if( !bHaveWarned )
155 : {
156 0 : bHaveWarned = 1;
157 : CPLError( CE_Warning, CPLE_AppDefined,
158 : "Recode from %s to UTF-8 not supported, treated as ISO8859-1 to UTF-8.",
159 0 : pszSrcEncoding );
160 : }
161 :
162 0 : utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
163 :
164 0 : return pszResult;
165 : }
166 :
167 : /* -------------------------------------------------------------------- */
168 : /* UTF-8 to anything else is treated as UTF-8 to ISO-8859-1 */
169 : /* with a warning. */
170 : /* -------------------------------------------------------------------- */
171 0 : if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0
172 : && strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0 )
173 : {
174 0 : int nCharCount = strlen(pszSource);
175 0 : char *pszResult = (char *) CPLCalloc(1,nCharCount+1);
176 : static int bHaveWarned = FALSE;
177 :
178 0 : if( !bHaveWarned )
179 : {
180 0 : bHaveWarned = 1;
181 : CPLError( CE_Warning, CPLE_AppDefined,
182 : "Recode from UTF-8 to %s not supported, treated as UTF-8 to ISO8859-1.",
183 0 : pszDstEncoding );
184 : }
185 :
186 0 : utf8toa( pszSource, nCharCount, pszResult, nCharCount+1 );
187 :
188 0 : return pszResult;
189 : }
190 :
191 : /* -------------------------------------------------------------------- */
192 : /* Everything else is treated as a no-op with a warning. */
193 : /* -------------------------------------------------------------------- */
194 : {
195 : static int bHaveWarned = FALSE;
196 :
197 0 : if( !bHaveWarned )
198 : {
199 0 : bHaveWarned = 1;
200 : CPLError( CE_Warning, CPLE_AppDefined,
201 : "Recode from %s to %s not supported, no change applied.",
202 0 : pszSrcEncoding, pszDstEncoding );
203 : }
204 :
205 0 : return CPLStrdup(pszSource);
206 : }
207 : }
208 :
209 : /************************************************************************/
210 : /* CPLRecodeFromWChar() */
211 : /************************************************************************/
212 :
213 : /**
214 : * Convert wchar_t string to UTF-8.
215 : *
216 : * Convert a wchar_t string into a multibyte utf-8 string. The only
217 : * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
218 : * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
219 : * and CPL_ENC_ISO8859_1. In some cases (ie. using iconv()) other encodings
220 : * may also be supported.
221 : *
222 : * Note that the wchar_t type varies in size on different systems. On
223 : * win32 it is normally 2 bytes, and on unix 4 bytes.
224 : *
225 : * If an error occurs an error may, or may not be posted with CPLError().
226 : *
227 : * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
228 : * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
229 : * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
230 : *
231 : * @return a zero terminated multi-byte string which should be freed with
232 : * CPLFree(), or NULL if an error occurs.
233 : *
234 : * @since GDAL 1.6.0
235 : */
236 :
237 0 : char CPL_DLL *CPLRecodeFromWChar( const wchar_t *pwszSource,
238 : const char *pszSrcEncoding,
239 : const char *pszDstEncoding )
240 :
241 : {
242 : /* -------------------------------------------------------------------- */
243 : /* We try to avoid changes of character set. We are just */
244 : /* providing for unicode to unicode. */
245 : /* -------------------------------------------------------------------- */
246 0 : if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0
247 : && strcmp(pszSrcEncoding,CPL_ENC_UTF16) != 0
248 : && strcmp(pszSrcEncoding,CPL_ENC_UCS2) != 0
249 : && strcmp(pszSrcEncoding,CPL_ENC_UCS4) != 0 )
250 : {
251 : CPLError( CE_Failure, CPLE_AppDefined,
252 : "Stub recoding implementation does not support\n"
253 : "CPLRecodeFromWChar(...,%s,%s)",
254 0 : pszSrcEncoding, pszDstEncoding );
255 0 : return NULL;
256 : }
257 :
258 : /* -------------------------------------------------------------------- */
259 : /* What is the source length. */
260 : /* -------------------------------------------------------------------- */
261 0 : int nSrcLen = 0;
262 :
263 0 : while( pwszSource[nSrcLen] != 0 )
264 0 : nSrcLen++;
265 :
266 : /* -------------------------------------------------------------------- */
267 : /* Allocate destination buffer plenty big. */
268 : /* -------------------------------------------------------------------- */
269 : char *pszResult;
270 : int nDstBufSize, nDstLen;
271 :
272 0 : nDstBufSize = nSrcLen * 4 + 1;
273 0 : pszResult = (char *) CPLMalloc(nDstBufSize); // nearly worst case.
274 :
275 : /* -------------------------------------------------------------------- */
276 : /* Convert, and confirm we had enough space. */
277 : /* -------------------------------------------------------------------- */
278 0 : nDstLen = utf8fromwc( pszResult, nDstBufSize, pwszSource, nSrcLen );
279 0 : if( nDstLen >= nDstBufSize - 1 )
280 : {
281 : CPLAssert( FALSE ); // too small!
282 0 : return NULL;
283 : }
284 :
285 : /* -------------------------------------------------------------------- */
286 : /* If something other than UTF-8 was requested, recode now. */
287 : /* -------------------------------------------------------------------- */
288 0 : if( strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
289 0 : return pszResult;
290 :
291 : char *pszFinalResult =
292 0 : CPLRecode( pszResult, CPL_ENC_UTF8, pszDstEncoding );
293 :
294 0 : CPLFree( pszResult );
295 :
296 0 : return pszFinalResult;
297 : }
298 :
299 : /************************************************************************/
300 : /* CPLRecodeToWChar() */
301 : /************************************************************************/
302 :
303 : /**
304 : * Convert UTF-8 string to a wchar_t string.
305 : *
306 : * Convert a 8bit, multi-byte per character input string into a wide
307 : * character (wchar_t) string. The only guaranteed supported source encodings
308 : * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1). The only
309 : * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source
310 : * and destination encodings may be supported depending on the underlying
311 : * implementation.
312 : *
313 : * Note that the wchar_t type varies in size on different systems. On
314 : * win32 it is normally 2 bytes, and on unix 4 bytes.
315 : *
316 : * If an error occurs an error may, or may not be posted with CPLError().
317 : *
318 : * @param pszSource input multi-byte character string.
319 : * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
320 : * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
321 : *
322 : * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
323 : * NULL on error.
324 : *
325 : * @since GDAL 1.6.0
326 : */
327 :
328 0 : wchar_t CPL_DLL *CPLRecodeToWChar( const char *pszSource,
329 : const char *pszSrcEncoding,
330 : const char *pszDstEncoding )
331 :
332 : {
333 0 : char *pszUTF8Source = (char *) pszSource;
334 :
335 0 : if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0
336 : && strcmp(pszSrcEncoding,CPL_ENC_ASCII) != 0 )
337 : {
338 0 : pszUTF8Source = CPLRecode( pszSource, pszSrcEncoding, CPL_ENC_UTF8 );
339 0 : if( pszUTF8Source == NULL )
340 0 : return NULL;
341 : }
342 :
343 : /* -------------------------------------------------------------------- */
344 : /* We try to avoid changes of character set. We are just */
345 : /* providing for unicode to unicode. */
346 : /* -------------------------------------------------------------------- */
347 0 : if( strcmp(pszDstEncoding,CPL_ENC_UCS2) != 0
348 : && strcmp(pszDstEncoding,CPL_ENC_UCS4) != 0
349 : && strcmp(pszDstEncoding,CPL_ENC_UTF16) != 0 )
350 : {
351 : CPLError( CE_Failure, CPLE_AppDefined,
352 : "Stub recoding implementation does not support\n"
353 : "CPLRecodeToWChar(...,%s,%s)",
354 0 : pszSrcEncoding, pszDstEncoding );
355 0 : return NULL;
356 : }
357 :
358 : /* -------------------------------------------------------------------- */
359 : /* Do the UTF-8 to UCS-2 recoding. */
360 : /* -------------------------------------------------------------------- */
361 0 : int nSrcLen = strlen(pszUTF8Source);
362 0 : wchar_t *pwszResult = (wchar_t *) CPLCalloc(sizeof(wchar_t),nSrcLen+1);
363 :
364 0 : utf8towc( pszUTF8Source, nSrcLen, pwszResult, nSrcLen+1 );
365 :
366 0 : if( pszUTF8Source != pszSource )
367 0 : CPLFree( pszUTF8Source );
368 :
369 0 : return pwszResult;
370 : }
371 :
372 :
373 : /************************************************************************/
374 : /* CPLIsUTF8() */
375 : /************************************************************************/
376 :
377 : /**
378 : * Test if a string is encoded as UTF-8.
379 : *
380 : * @param pabyData input string to test
381 : * @param nLen length of the input string, or -1 if the function must compute
382 : * the string length. In which case it must be null terminated.
383 : * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
384 : *
385 : * @since GDAL 1.7.0
386 : */
387 94 : int CPLIsUTF8(const char* pabyData, int nLen)
388 : {
389 94 : if (nLen < 0)
390 94 : nLen = strlen(pabyData);
391 94 : return utf8test(pabyData, (unsigned)nLen) != 0;
392 : }
393 :
394 : /************************************************************************/
395 : /* CPLForceToASCII() */
396 : /************************************************************************/
397 :
398 : /**
399 : * Return a new string that is made only of ASCII characters. If non-ASCII
400 : * characters are found in the input string, they will be replaced by the
401 : * provided replacement character.
402 : *
403 : * @param pabyData input string to test
404 : * @param nLen length of the input string, or -1 if the function must compute
405 : * the string length. In which case it must be null terminated.
406 : * @param chReplacementChar character which will be used when the input stream
407 : * contains a non ASCII character. Must be valid ASCII !
408 : *
409 : * @return a new string that must be freed with CPLFree().
410 : *
411 : * @since GDAL 1.7.0
412 : */
413 1 : char CPL_DLL *CPLForceToASCII(const char* pabyData, int nLen, char chReplacementChar)
414 : {
415 1 : if (nLen < 0)
416 1 : nLen = strlen(pabyData);
417 1 : char* pszOutputString = (char*)CPLMalloc(nLen + 1);
418 : int i;
419 5 : for(i=0;i<nLen;i++)
420 : {
421 4 : if (((unsigned char*)pabyData)[i] > 127)
422 1 : pszOutputString[i] = chReplacementChar;
423 : else
424 3 : pszOutputString[i] = pabyData[i];
425 : }
426 1 : pszOutputString[i] = '\0';
427 1 : return pszOutputString;
428 : }
429 :
430 :
431 : /************************************************************************/
432 : /* ==================================================================== */
433 : /* UTF.C code from FLTK with some modifications. */
434 : /* ==================================================================== */
435 : /************************************************************************/
436 :
/* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
   they are instead turned into the Unicode REPLACEMENT CHARACTER, of
   value 0xfffd.
440 : If this is on utf8decode will correctly map most (perhaps all)
441 : human-readable text that is in ISO-8859-1. This may allow you
442 : to completely ignore character sets in your code because virtually
443 : everything is either ISO-8859-1 or UTF-8.
444 : */
445 : #define ERRORS_TO_ISO8859_1 1
446 :
447 : /* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
448 : Unicode index for Microsoft's CP1252 character set. You should
449 : also set ERRORS_TO_ISO8859_1. With this a huge amount of more
450 : available text (such as all web pages) are correctly converted
451 : to Unicode.
452 : */
453 : #define ERRORS_TO_CP1252 1
454 :
/* A number of Unicode code points are in fact illegal and should not
   be produced by a UTF-8 converter. Turning this on will replace the
   bytes in those encodings with errors. If you do this then converting
   arbitrary 16-bit data to UTF-8 and then back is not an identity,
   which will probably break a lot of software.
*/
461 : #define STRICT_RFC3629 0
462 :
463 : #if ERRORS_TO_CP1252
// Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
// to Unicode and indexed by (byte - 0x80).  Entries 0x81, 0x8d, 0x8f,
// 0x90 and 0x9d map to themselves.  The table is only ever read, so it
// is declared const.
static const unsigned short cp1252[32] = {
  0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
  0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
  0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
  0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
};
472 : #endif
473 :
474 : /************************************************************************/
475 : /* utf8decode() */
476 : /************************************************************************/
477 :
478 : /*
    Decode a single UTF-8 encoded character starting at \e p. The
    resulting Unicode value (in the range 0-0x10ffff) is returned,
    and \e len is set to the number of bytes in the UTF-8 encoding
    (adding \e len to \e p will point at the next character).
483 :
    If \a p points at an illegal UTF-8 encoding, including one that
    would go past \e end, or where a code uses more bytes than
    necessary, then *(unsigned char*)p is translated as though it is
    in the Microsoft CP1252 character set and \e len is set to 1.
    Treating errors this way allows this to decode almost any
    ISO-8859-1 or CP1252 text that has been mistakenly placed where
    UTF-8 is expected, and has proven very useful.
491 :
492 : If you want errors to be converted to error characters (as the
493 : standards recommend), adding a test to see if the length is
494 : unexpectedly 1 will work:
495 :
496 : \code
497 : if (*p & 0x80) { // what should be a multibyte encoding
498 : code = utf8decode(p,end,&len);
499 : if (len<2) code = 0xFFFD; // Turn errors into REPLACEMENT CHARACTER
500 : } else { // handle the 1-byte utf8 encoding:
501 : code = *p;
502 : len = 1;
503 : }
504 : \endcode
505 :
506 : Direct testing for the 1-byte case (as shown above) will also
507 : speed up the scanning of strings where the majority of characters
508 : are ASCII.
509 : */
static unsigned utf8decode(const char* p, const char* end, int* len)
{
  // View the input as unsigned bytes so range tests are sign-safe.
  const unsigned char* const s = (const unsigned char*)p;
  const unsigned char lead = s[0];

  // Sequence length implied by the lead byte once the first
  // continuation byte has been vetted; 0 means "not a valid start".
  int need = 0;

  // Single-byte results and every failure consume exactly one byte.
  *len = 1;

  if (lead < 0x80) return lead;

#if ERRORS_TO_CP1252
  // Stray 0x80..0x9f bytes are mapped through the CP1252 table.
  if (lead < 0xa0) return cp1252[lead - 0x80];
#endif

  // Lead bytes below 0xc2 are continuation bytes or overlong 2-byte
  // starts.  Every multi-byte form needs a continuation byte at p[1].
  if (lead >= 0xc2 && p+1 < end && (s[1] & 0xc0) == 0x80) {
    if (lead < 0xe0) {
      need = 2;
    } else if (lead < 0xf0) {
      need = 3;
      // 0xe0 followed by < 0xa0 is an overlong 3-byte form.
      if (lead == 0xe0 && s[1] < 0xa0) need = 0;
#if STRICT_RFC3629
      // RFC 3629 says surrogate chars are illegal.
      else if (lead == 0xed && s[1] >= 0xa0) need = 0;
      // 0xfffe and 0xffff are also illegal characters
      else if (lead == 0xef && s[1] == 0xbf && s[2] >= 0xbe) need = 0;
#endif
    } else if (lead <= 0xf4) {
      need = 4;
      // 0xf0 followed by < 0x90 is an overlong 4-byte form.
      if (lead == 0xf0 && s[1] < 0x90) need = 0;
      // 0xf4 followed by > 0x8f lies after 0x10ffff.
      else if (lead == 0xf4 && s[1] > 0x8f) need = 0;
    }
  }

  if (need == 2) {
    *len = 2;
    return (unsigned)(((lead & 0x1f) << 6) +
                      ((s[1] & 0x3f)));
  }

  if (need == 3 && p+2 < end && (s[2] & 0xc0) == 0x80) {
    *len = 3;
    return (unsigned)(((lead & 0x0f) << 12) +
                      ((s[1] & 0x3f) << 6) +
                      ((s[2] & 0x3f)));
  }

  if (need == 4 && p+3 < end &&
      (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80) {
#if STRICT_RFC3629
    // RFC 3629 says all codes ending in fffe or ffff are illegal:
    if (!((s[1] & 0xf) == 0xf && s[2] == 0xbf && s[3] >= 0xbe))
#endif
    {
      *len = 4;
      return (unsigned)(((lead & 0x07) << 18) +
                        ((s[1] & 0x3f) << 12) +
                        ((s[2] & 0x3f) << 6) +
                        ((s[3] & 0x3f)));
    }
  }

  // Anything that reaches this point is an encoding error.
#if ERRORS_TO_ISO8859_1
  return lead;
#else
  return 0xfffd; // Unicode REPLACEMENT CHARACTER
#endif
}
583 :
584 : /************************************************************************/
585 : /* utf8fwd() */
586 : /************************************************************************/
587 :
588 : /*
589 : Move \a p forward until it points to the start of a UTF-8
590 : character. If it already points at the start of one then it
591 : is returned unchanged. Any UTF-8 errors are treated as though each
592 : byte of the error is an individual character.
593 :
594 : \e start is the start of the string and is used to limit the
595 : backwards search for the start of a utf8 character.
596 :
    \e end is the end of the string and is assumed to be a break
    between characters. It is assumed to be greater than p.
599 :
    This function is for moving a pointer that was jumped to the
    middle of a string, such as when doing a binary search for
    a position. You should use either this or utf8back() depending
    on which direction your algorithm can handle the pointer
    moving. Do not use this to scan strings, use utf8decode()
    instead.
606 : */
607 :
608 : #ifdef FUTURE_NEEDS
609 : static const char* utf8fwd(const char* p, const char* start, const char* end)
610 : {
611 : const char* a;
612 : int len;
613 : // if we are not pointing at a continuation character, we are done:
614 : if ((*p&0xc0) != 0x80) return p;
615 : // search backwards for a 0xc0 starting the character:
616 : for (a = p-1; ; --a) {
617 : if (a < start) return p;
618 : if (!(a[0]&0x80)) return p;
619 : if ((a[0]&0x40)) break;
620 : }
621 : utf8decode(a,end,&len);
622 : a += len;
623 : if (a > p) return a;
624 : return p;
625 : }
626 : #endif /* def FUTURE_NEEDS */
627 :
628 : /************************************************************************/
629 : /* utf8back() */
630 : /************************************************************************/
631 :
632 : /*
633 : Move \a p backward until it points to the start of a UTF-8
634 : character. If it already points at the start of one then it
635 : is returned unchanged. Any UTF-8 errors are treated as though each
636 : byte of the error is an individual character.
637 :
638 : \e start is the start of the string and is used to limit the
639 : backwards search for the start of a UTF-8 character.
640 :
    \e end is the end of the string and is assumed to be a break
    between characters. It is assumed to be greater than p.
643 :
644 : If you wish to decrement a UTF-8 pointer, pass p-1 to this.
645 : */
646 :
647 : #ifdef FUTURE_NEEDS
648 : static const char* utf8back(const char* p, const char* start, const char* end)
649 : {
650 : const char* a;
651 : int len;
652 : // if we are not pointing at a continuation character, we are done:
653 : if ((*p&0xc0) != 0x80) return p;
654 : // search backwards for a 0xc0 starting the character:
655 : for (a = p-1; ; --a) {
656 : if (a < start) return p;
657 : if (!(a[0]&0x80)) return p;
658 : if ((a[0]&0x40)) break;
659 : }
660 : utf8decode(a,end,&len);
661 : if (a+len > p) return a;
662 : return p;
663 : }
664 : #endif /* def FUTURE_NEEDS */
665 :
666 : /************************************************************************/
667 : /* utf8bytes() */
668 : /************************************************************************/
669 :
670 : /* Returns number of bytes that utf8encode() will use to encode the
671 : character \a ucs. */
672 : #ifdef FUTURE_NEEDS
/* Number of bytes needed to encode ucs as UTF-8: 1 to 4 for code points
   up to and including 0x10ffff.  Larger values are illegal and are
   counted as the 3 bytes of the 0xfffd replacement encoding. */
static int utf8bytes(unsigned ucs) {
  if (ucs < 0x000080U) {
    return 1;
  } else if (ucs < 0x000800U) {
    return 2;
  } else if (ucs < 0x010000U) {
    return 3;
  } else if (ucs <= 0x10ffffU) {
    // 0x10ffff itself is the last valid code point, hence "<=".
    return 4;
  } else {
    return 3; // length of the illegal character encoding
  }
}
686 : #endif /* def FUTURE_NEEDS */
687 :
688 : /************************************************************************/
689 : /* utf8encode() */
690 : /************************************************************************/
691 :
692 : /* Write the UTF-8 encoding of \e ucs into \e buf and return the
693 : number of bytes written. Up to 4 bytes may be written. If you know
694 : that \a ucs is less than 0x10000 then at most 3 bytes will be written.
695 : If you wish to speed this up, remember that anything less than 0x80
696 : is written as a single byte.
697 :
698 : If ucs is greater than 0x10ffff this is an illegal character
699 : according to RFC 3629. These are converted as though they are
700 : 0xFFFD (REPLACEMENT CHARACTER).
701 :
702 : RFC 3629 also says many other values for \a ucs are illegal (in
703 : the range 0xd800 to 0xdfff, or ending with 0xfffe or
704 : 0xffff). However I encode these as though they are legal, so that
705 : utf8encode/utf8decode will be the identity for all codes between 0
706 : and 0x10ffff.
707 : */
708 : #ifdef FUTURE_NEEDS
/* Write the UTF-8 encoding of ucs into buf (which must have room for 4
   bytes; no terminator is written) and return the number of bytes
   written.  Values above 0x10ffff are written as the 3-byte encoding of
   0xfffd (REPLACEMENT CHARACTER). */
static int utf8encode(unsigned ucs, char* buf) {
  if (ucs < 0x000080U) {
    buf[0] = (char)ucs;
    return 1;
  } else if (ucs < 0x000800U) {
    buf[0] = (char)(0xc0 | (ucs >> 6));
    buf[1] = (char)(0x80 | (ucs & 0x3F));
    return 2;
  } else if (ucs < 0x010000U) {
    buf[0] = (char)(0xe0 | (ucs >> 12));
    buf[1] = (char)(0x80 | ((ucs >> 6) & 0x3F));
    buf[2] = (char)(0x80 | (ucs & 0x3F));
    return 3;
  } else if (ucs <= 0x0010ffffU) {
    // 0x10ffff itself is the last valid code point, hence "<=".
    buf[0] = (char)(0xf0 | (ucs >> 18));
    buf[1] = (char)(0x80 | ((ucs >> 12) & 0x3F));
    buf[2] = (char)(0x80 | ((ucs >> 6) & 0x3F));
    buf[3] = (char)(0x80 | (ucs & 0x3F));
    return 4;
  } else {
    // encode 0xfffd:
    buf[0] = (char)0xefU;
    buf[1] = (char)0xbfU;
    buf[2] = (char)0xbdU;
    return 3;
  }
}
736 : #endif /* def FUTURE_NEEDS */
737 :
738 : /************************************************************************/
739 : /* utf8towc() */
740 : /************************************************************************/
741 :
742 : /* Convert a UTF-8 sequence into an array of wchar_t. These
743 : are used by some system calls, especially on Windows.
744 :
745 : \a src points at the UTF-8, and \a srclen is the number of bytes to
746 : convert.
747 :
748 : \a dst points at an array to write, and \a dstlen is the number of
749 : locations in this array. At most \a dstlen-1 words will be
750 : written there, plus a 0 terminating word. Thus this function
751 : will never overwrite the buffer and will always return a
752 : zero-terminated string. If \a dstlen is zero then \a dst can be
753 : null and no data is written, but the length is returned.
754 :
755 : The return value is the number of words that \e would be written
756 : to \a dst if it were long enough, not counting the terminating
757 : zero. If the return value is greater or equal to \a dstlen it
758 : indicates truncation, you can then allocate a new array of size
759 : return+1 and call this again.
760 :
761 : Errors in the UTF-8 are converted as though each byte in the
762 : erroneous string is in the Microsoft CP1252 encoding. This allows
763 : ISO-8859-1 text mistakenly identified as UTF-8 to be printed
764 : correctly.
765 :
766 : Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
767 : and most other systems. Where wchar_t is 16 bits, Unicode
768 : characters in the range 0x10000 to 0x10ffff are converted to
769 : "surrogate pairs" which take two words each (this is called UTF-16
770 : encoding). If wchar_t is 32 bits this rather nasty problem is
771 : avoided.
772 : */
773 0 : static unsigned utf8towc(const char* src, unsigned srclen,
774 : wchar_t* dst, unsigned dstlen)
775 : {
776 0 : const char* p = src;
777 0 : const char* e = src+srclen;
778 0 : unsigned count = 0;
779 0 : if (dstlen) for (;;) {
780 0 : if (p >= e) {dst[count] = 0; return count;}
781 0 : if (!(*p & 0x80)) { // ascii
782 0 : dst[count] = *p++;
783 : } else {
784 0 : int len; unsigned ucs = utf8decode(p,e,&len);
785 0 : p += len;
786 : #ifdef _WIN32
787 : if (ucs < 0x10000) {
788 : dst[count] = (wchar_t)ucs;
789 : } else {
790 : // make a surrogate pair:
791 : if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
792 : dst[count] = (wchar_t)((((ucs-0x10000u)>>10)&0x3ff) | 0xd800);
793 : dst[++count] = (wchar_t)((ucs&0x3ff) | 0xdc00);
794 : }
795 : #else
796 0 : dst[count] = (wchar_t)ucs;
797 : #endif
798 : }
799 0 : if (++count == dstlen) {dst[count-1] = 0; break;}
800 : }
801 : // we filled dst, measure the rest:
802 0 : while (p < e) {
803 0 : if (!(*p & 0x80)) p++;
804 : else {
805 : #ifdef _WIN32
806 : int len; unsigned ucs = utf8decode(p,e,&len);
807 : p += len;
808 : if (ucs >= 0x10000) ++count;
809 : #else
810 0 : int len; utf8decode(p,e,&len);
811 0 : p += len;
812 : #endif
813 : }
814 0 : ++count;
815 : }
816 0 : return count;
817 : }
818 :
819 : /************************************************************************/
820 : /* utf8toa() */
821 : /************************************************************************/
822 : /* Convert a UTF-8 sequence into an array of 1-byte characters.
823 :
824 : If the UTF-8 decodes to a character greater than 0xff then it is
825 : replaced with '?'.
826 :
827 : Errors in the UTF-8 are converted as individual bytes, same as
828 : utf8decode() does. This allows ISO-8859-1 text mistakenly identified
829 : as UTF-8 to be printed correctly (and possibly CP1252 on Windows).
830 :
831 : \a src points at the UTF-8, and \a srclen is the number of bytes to
832 : convert.
833 :
834 : Up to \a dstlen bytes are written to \a dst, including a null
835 : terminator. The return value is the number of bytes that would be
836 : written, not counting the null terminator. If greater or equal to
837 : \a dstlen then if you malloc a new array of size n+1 you will have
838 : the space needed for the entire string. If \a dstlen is zero then
839 : nothing is written and this call just measures the storage space
840 : needed.
841 : */
/* Decode srclen bytes of UTF-8 at src into single-byte characters.
   Code points above 0xff become '?'; bytes below 0xC2 (ASCII and
   bytes that cannot start a multi-byte sequence) are copied verbatim.
   At most dstlen bytes are written, terminator included. Returns the
   number of bytes the full conversion needs, terminator not counted. */
static unsigned utf8toa(const char* src, unsigned srclen,
                        char* dst, unsigned dstlen)
{
  const char* cur = src;
  const char* const end = src + srclen;
  unsigned n = 0;
  int full = (dstlen == 0);

  while (!full) {
    unsigned char lead;
    if (cur >= end) {
      dst[n] = 0;
      return n;
    }
    lead = (unsigned char)*cur;
    if (lead >= 0xC2) {
      // possible multi-byte lead: let the decoder sort it out
      int seqlen;
      unsigned ucs = utf8decode(cur, end, &seqlen);
      cur += seqlen;
      dst[n] = (ucs < 0x100) ? (char)ucs : '?';
    } else {
      // ASCII or a byte that cannot begin a sequence
      dst[n] = lead;
      ++cur;
    }
    if (++n >= dstlen) {
      // the last slot is needed for the terminator
      dst[n - 1] = 0;
      full = 1;
    }
  }

  // dst is full (or absent): every remaining character costs one byte
  for (; cur < end; ++n) {
    if ((*cur & 0x80) == 0) {
      ++cur;
    } else {
      int seqlen;
      utf8decode(cur, end, &seqlen);
      cur += seqlen;
    }
  }
  return n;
}
875 :
876 : /************************************************************************/
877 : /* utf8fromwc() */
878 : /************************************************************************/
879 : /* Turn "wide characters" as returned by some system calls
880 : (especially on Windows) into UTF-8.
881 :
882 : Up to \a dstlen bytes are written to \a dst, including a null
883 : terminator. The return value is the number of bytes that would be
884 : written, not counting the null terminator. If greater or equal to
885 : \a dstlen then if you malloc a new array of size n+1 you will have
886 : the space needed for the entire string. If \a dstlen is zero then
887 : nothing is written and this call just measures the storage space
888 : needed.
889 :
890 : \a srclen is the number of words in \a src to convert. On Windows
891 : this is not necessarily the number of characters, due to there
892 : possibly being "surrogate pairs" in the UTF-16 encoding used.
893 : On Unix wchar_t is 32 bits and each location is a character.
894 :
895 : On Unix if a src word is greater than 0x10ffff then this is an
896 : illegal character according to RFC 3629. These are converted as
897 : though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
898 : range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
899 : illegal according to RFC 3629. However I encode these as though
900 : they are legal, so that utf8towc will return the original data.
901 :
902 : On Windows "surrogate pairs" are converted to a single character
903 : and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
904 : pairs are converted as though they are individual characters.
905 : */
/* Fetch the code point starting at src[*pos] and advance *pos past it.

   On Windows a high surrogate immediately followed by a low surrogate
   is combined into one code point (both words are consumed); an
   unpaired half is returned unchanged. Elsewhere values above 0x10ffff
   are replaced by 0xfffd (REPLACEMENT CHARACTER).

   The caller must guarantee *pos < srclen. */
static unsigned utf8fromwc_next(const wchar_t* src, unsigned srclen,
                                unsigned* pos)
{
  unsigned ucs = src[(*pos)++];
#ifdef _WIN32
  if (ucs >= 0xd800 && ucs <= 0xdbff && *pos < srclen &&
      src[*pos] >= 0xdc00 && src[*pos] <= 0xdfff) {
    // surrogate pair
    unsigned ucs2 = src[(*pos)++];
    ucs = 0x10000U + ((ucs&0x3ff)<<10) + (ucs2&0x3ff);
  }
#else
  (void)srclen;
  if (ucs > 0x10ffff) ucs = 0xfffd;
#endif
  return ucs;
}

/* Encode srclen wide characters from src as UTF-8 into dst.

   At most dstlen bytes are written, terminator included. A character
   is only written when all of its bytes plus the terminator fit. The
   return value is the number of bytes the complete conversion needs,
   terminator not counted; a result >= dstlen means the output was
   truncated. With dstlen == 0 nothing is written and the input is
   only measured.

   Both the writing loop and the measuring loop obtain code points
   through utf8fromwc_next(), so they always agree on how many words a
   character occupies. (Previously the Windows measuring loop looked
   for the low surrogate at src[i+1] instead of src[i] and therefore
   mis-measured surrogate pairs.) */
static unsigned utf8fromwc(char* dst, unsigned dstlen,
                           const wchar_t* src, unsigned srclen) {
  unsigned i = 0;
  unsigned count = 0;
  if (dstlen) for (;;) {
    unsigned ucs;
    if (i >= srclen) {dst[count] = 0; return count;}
    ucs = utf8fromwc_next(src, srclen, &i);
    if (ucs < 0x80U) {
      dst[count++] = (char)ucs;
      // the byte just written is replaced by the terminator if it
      // occupied the last slot; it is still counted
      if (count >= dstlen) {dst[count-1] = 0; break;}
    } else if (ucs < 0x800U) { // 2 bytes
      if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
      dst[count++] = (char)(0xc0 | (ucs >> 6));
      dst[count++] = (char)(0x80 | (ucs & 0x3F));
    } else if (ucs < 0x10000U) { // 3 bytes
      if (count+3 >= dstlen) {dst[count] = 0; count += 3; break;}
      dst[count++] = (char)(0xe0 | (ucs >> 12));
      dst[count++] = (char)(0x80 | ((ucs >> 6) & 0x3F));
      dst[count++] = (char)(0x80 | (ucs & 0x3F));
    } else { // 4 bytes
      if (count+4 >= dstlen) {dst[count] = 0; count += 4; break;}
      dst[count++] = (char)(0xf0 | (ucs >> 18));
      dst[count++] = (char)(0x80 | ((ucs >> 12) & 0x3F));
      dst[count++] = (char)(0x80 | ((ucs >> 6) & 0x3F));
      dst[count++] = (char)(0x80 | (ucs & 0x3F));
    }
  }
  // we filled dst, measure the rest:
  while (i < srclen) {
    unsigned ucs = utf8fromwc_next(src, srclen, &i);
    if (ucs < 0x80U)         count += 1;
    else if (ucs < 0x800U)   count += 2;
    else if (ucs < 0x10000U) count += 3;
    else                     count += 4;
  }
  return count;
}
973 :
974 :
975 : /************************************************************************/
976 : /* utf8froma() */
977 : /************************************************************************/
978 :
979 : /* Convert an ISO-8859-1 (ie normal c-string) byte stream to UTF-8.
980 :
981 : It is possible this should convert Microsoft's CP1252 to UTF-8
982 : instead. This would translate the codes in the range 0x80-0x9f
983 : to different characters. Currently it does not do this.
984 :
985 : Up to \a dstlen bytes are written to \a dst, including a null
986 : terminator. The return value is the number of bytes that would be
987 : written, not counting the null terminator. If greater or equal to
988 : \a dstlen then if you malloc a new array of size n+1 you will have
989 : the space needed for the entire string. If \a dstlen is zero then
990 : nothing is written and this call just measures the storage space
991 : needed.
992 :
993 : \a srclen is the number of bytes in \a src to convert.
994 :
995 : If the return value equals \a srclen then this indicates that
996 : no conversion is necessary, as only ASCII characters are in the
997 : string.
998 : */
/* Re-encode srclen bytes of ISO-8859-1 text at src as UTF-8 in dst.
   Bytes below 0x80 are copied; every other byte becomes a two-byte
   sequence. At most dstlen bytes are written, terminator included,
   and a two-byte sequence is only written when it fits together with
   the terminator. Returns the number of bytes the full conversion
   needs, terminator not counted. */
static unsigned utf8froma(char* dst, unsigned dstlen,
                          const char* src, unsigned srclen) {
  const unsigned char* in = (const unsigned char*)src;
  const unsigned char* const stop = in + srclen;
  unsigned written = 0;
  int full = (dstlen == 0);

  while (!full) {
    unsigned char ch;
    if (in >= stop) {
      dst[written] = 0;
      return written;
    }
    ch = *in++;
    if (ch < 0x80U) {
      dst[written++] = ch;
      if (written >= dstlen) {
        // the last slot is needed for the terminator
        dst[written - 1] = 0;
        full = 1;
      }
    } else if (written + 2 >= dstlen) {
      // no room for two bytes plus terminator: stop, but count them
      dst[written] = 0;
      written += 2;
      full = 1;
    } else {
      // 2 bytes (note that a CP1252 translation could make 3 bytes!)
      dst[written++] = 0xc0 | (ch >> 6);
      dst[written++] = 0x80 | (ch & 0x3F);
    }
  }

  // dst is full (or absent): just add up what the rest would need
  for (; in < stop; ++in)
    written += (*in < 0x80U) ? 1 : 2;
  return written;
}
1028 :
1029 : /*
1030 : ** For now we disable the rest which is locale() related. We may need
1031 : ** parts of it later.
1032 : */
1033 :
1034 : #ifdef notdef
1035 :
1036 : #ifdef _WIN32
1037 : # include <windows.h>
1038 : #endif
1039 :
1040 : /*! Return true if the "locale" seems to indicate that UTF-8 encoding
1041 : is used. If true the utf8tomb and utf8frommb don't do anything
1042 : useful.
1043 :
1044 : <i>It is highly recommended that you change your system so this
1045 : does return true.</i> On Windows this is done by setting the
1046 : "codepage" to CP_UTF8. On Unix this is done by setting $LC_CTYPE
1047 : to a string containing the letters "utf" or "UTF" in it, or by
1048 : deleting all $LC* and $LANG environment variables. In the future
1049 : it is likely that all non-Asian Unix systems will return true,
1050 : due to the compatibility of UTF-8 with ISO-8859-1.
1051 : */
/* Report whether the process locale appears to use UTF-8.

   The answer is computed on the first call and cached in a function
   static for all later calls. On Windows the active code page is
   compared with CP_UTF8. Elsewhere the first non-empty variable among
   LC_ALL, LC_CTYPE and LANG is inspected, in that order, which is the
   POSIX precedence for the character-type category; the result is 1
   if it contains "utf" or "UTF". If none is set, UTF-8 is assumed.

   Returns 1 for a UTF-8 locale, 0 otherwise. */
int utf8locale(void) {
  static int ret = 2;   // 2 means "not determined yet"
  if (ret == 2) {
#ifdef _WIN32
    ret = GetACP() == CP_UTF8;
#else
    // LC_ALL overrides LC_CTYPE, which in turn overrides LANG
    static const char* const names[] = {"LC_ALL", "LC_CTYPE", "LANG"};
    unsigned k;
    ret = 1; // assume UTF-8 if no locale
    for (k = 0; k < sizeof(names)/sizeof(names[0]); k++) {
      const char* s = getenv(names[k]);
      if (s && *s) {
        ret = (strstr(s,"utf") || strstr(s,"UTF"));
        break;
      }
    }
#endif
  }
  return ret;
}
1069 :
1070 : /*! Convert the UTF-8 used by FLTK to the locale-specific encoding
1071 : used for filenames (and sometimes used for data in files).
1072 : Unfortunately due to stupid design you will have to do this as
1073 : needed for filenames. This is a bug on both Unix and Windows.
1074 :
1075 : Up to \a dstlen bytes are written to \a dst, including a null
1076 : terminator. The return value is the number of bytes that would be
1077 : written, not counting the null terminator. If greater or equal to
1078 : \a dstlen then if you malloc a new array of size n+1 you will have
1079 : the space needed for the entire string. If \a dstlen is zero then
1080 : nothing is written and this call just measures the storage space
1081 : needed.
1082 :
1083 : If utf8locale() returns true then this does not change the data.
1084 : It is copied and truncated as necessary to
1085 : the destination buffer and \a srclen is always returned. */
/* Convert srclen bytes of UTF-8 at src to the locale's multibyte
   encoding in dst.

   At most dstlen bytes are written and, when dstlen is non-zero, dst
   is always null-terminated. The return value is the number of bytes
   the full conversion needs, terminator not counted. With dstlen == 0
   nothing is written.

   If utf8locale() is true, or the conversion fails, or the temporary
   wide buffer cannot be allocated, the input is copied unchanged
   (truncated to fit) and srclen is returned. */
unsigned utf8tomb(const char* src, unsigned srclen,
                  char* dst, unsigned dstlen)
{
  if (!utf8locale()) {
    wchar_t lbuf[1024];
    wchar_t* buf = lbuf;
    unsigned length = utf8towc(src, srclen, buf, 1024);
    if (length >= 1024) {
      // too long for the stack buffer: redo the conversion on the heap
      buf = (wchar_t*)(malloc(((size_t)length+1)*sizeof(wchar_t)));
      if (buf) utf8towc(src, srclen, buf, length+1);
    }
    if (buf) {
#ifdef _WIN32
      int ret = 0;
      if (dstlen > 1) {
        // With an explicit source length the output is not
        // null-terminated, so keep one byte back for the terminator.
        ret = WideCharToMultiByte(GetACP(), 0, buf, length,
                                  dst, dstlen-1, 0, 0);
        dst[ret] = 0;
      } else if (dstlen == 1) {
        dst[0] = 0;
      }
      // 0 means nothing was converted (overflow, or we were only asked
      // to measure): get the actual length
      if (ret == 0)
        ret = WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
      if (buf != lbuf) free((void*)buf);
      return (unsigned)ret;
#else
      // measure first so the result is exact even when dst truncates
      size_t need = wcstombs(0, buf, 0);
      if (need != (size_t)-1 && dstlen) {
        // wcstombs() does not terminate a completely filled buffer,
        // so convert into dstlen-1 bytes and terminate explicitly
        size_t n = wcstombs(dst, buf, dstlen-1);
        dst[n == (size_t)-1 ? 0 : n] = 0;
      }
      if (buf != lbuf) free((void*)buf);
      if (need != (size_t)-1) return (unsigned)need;
      // on any errors we return the UTF-8 as raw text...
#endif
    }
  }
  // identity transform:
  if (srclen < dstlen) {
    memcpy(dst, src, srclen);
    dst[srclen] = 0;
  } else if (dstlen) {
    // dstlen == 0 must write nothing (dstlen-1 would wrap around)
    memcpy(dst, src, dstlen-1);
    dst[dstlen-1] = 0;
  }
  return srclen;
}
1142 :
1143 : /*! Convert a filename from the locale-specific multibyte encoding
1144 : used by Windows to UTF-8 as used by FLTK.
1145 :
1146 : Up to \a dstlen bytes are written to \a dst, including a null
1147 : terminator. The return value is the number of bytes that would be
1148 : written, not counting the null terminator. If greater or equal to
1149 : \a dstlen then if you malloc a new array of size n+1 you will have
1150 : the space needed for the entire string. If \a dstlen is zero then
1151 : nothing is written and this call just measures the storage space
1152 : needed.
1153 :
1154 : On Unix or on Windows when a UTF-8 locale is in effect, this
1155 : does not change the data. It is copied and truncated as necessary to
1156 : the destination buffer and \a srclen is always returned.
1157 : You may also want to check if utf8test() returns non-zero, so that
1158 : the filesystem can store filenames in UTF-8 encoding regardless of
1159 : the locale.
1160 : */
/* Convert text in the locale's multibyte encoding at src to UTF-8 in
   dst.

   At most dstlen bytes are written, terminator included; the return
   value is the number of bytes the full conversion needs, terminator
   not counted. With dstlen == 0 nothing is written.

   If utf8locale() is true, or the conversion fails, or the temporary
   wide buffer cannot be allocated, the input is copied unchanged
   (truncated to fit) and srclen is returned.

   NOTE(review): the non-Windows path hands src to mbstowcs(), which
   reads up to a null terminator and does not honour srclen -- confirm
   that callers always pass null-terminated strings. */
unsigned utf8frommb(char* dst, unsigned dstlen,
                    const char* src, unsigned srclen)
{
  if (!utf8locale()) {
    wchar_t lbuf[1024];
    wchar_t* buf = lbuf;
#ifdef _WIN32
    unsigned length = 0;
    if (srclen) {
      length = MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
      // 0 is returned when lbuf is too small; 1024 means lbuf was
      // filled exactly. Either way measure and convert on the heap.
      if (length == 0 || length >= 1024) {
        length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
        if (length) {
          buf = (wchar_t*)(malloc((size_t)length*sizeof(wchar_t)));
          if (buf) MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
        }
      }
    }
    if (buf) {
      unsigned ret = utf8fromwc(dst, dstlen, buf, length);
      if (buf != lbuf) free((void*)buf);
      return ret;
    }
    // allocation failed: return the input unchanged
#else
    size_t length = mbstowcs(buf, src, 1024);
    if (length != (size_t)-1 && length >= 1024) {
      // lbuf was filled without room for a terminator: measure, then
      // convert into a heap buffer with one extra word for the
      // terminator. length itself never includes the terminator.
      length = mbstowcs(0, src, 0);
      if (length != (size_t)-1) {
        buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
        if (buf) mbstowcs(buf, src, length+1);
      }
    }
    if (buf && length != (size_t)-1) {
      unsigned ret = utf8fromwc(dst, dstlen, buf, (unsigned)length);
      if (buf != lbuf) free((void*)buf);
      return ret;
    }
    if (buf != lbuf) free((void*)buf);
    // errors in conversion return the input unchanged
#endif
  }
  // identity transform:
  if (srclen < dstlen) {
    memcpy(dst, src, srclen);
    dst[srclen] = 0;
  } else if (dstlen) {
    // dstlen == 0 must write nothing (dstlen-1 would wrap around)
    memcpy(dst, src, dstlen-1);
    dst[dstlen-1] = 0;
  }
  return srclen;
}
1209 :
1210 : #endif /* def notdef - disabled locale specific stuff */
1211 :
1212 : /*! Examines the first \a srclen bytes in \a src and return a verdict
1213 : on whether it is UTF-8 or not.
1214 : - Returns 0 if there is any illegal UTF-8 sequences, using the
1215 : same rules as utf8decode(). Note that some UCS values considered
1216 : illegal by RFC 3629, such as 0xffff, are considered legal by this.
1217 : - Returns 1 if there are only single-byte characters (ie no bytes
1218 : have the high bit set). This is legal UTF-8, but also indicates
1219 : plain ASCII. It also returns 1 if \a srclen is zero.
1220 : - Returns 2 if there are only characters less than 0x800.
1221 : - Returns 3 if there are only characters less than 0x10000.
1222 : - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
1223 :
1224 : Because there are many illegal sequences in UTF-8, it is almost
1225 : impossible for a string in another encoding to be confused with
1226 : UTF-8. This is very useful for transitioning Unix to UTF-8
1227 : filenames, you can simply test each filename with this to decide
1228 : if it is UTF-8 or in the locale encoding. My hope is that if
1229 : this is done we will be able to cleanly transition to a locale-less
1230 : encoding.
1231 : */
1232 :
/* Classify the first srclen bytes of src. Returns 0 as soon as a byte
   with the high bit set decodes (via utf8decode) to a length below 2;
   otherwise returns the longest sequence length seen, which is 1 when
   no byte has its high bit set or srclen is zero. */
static int utf8test(const char* src, unsigned srclen) {
  const char* const end = src + srclen;
  const char* cur = src;
  int widest = 1;

  while (cur < end) {
    int seqlen;
    if (!(*cur & 0x80)) {
      // single-byte character
      ++cur;
      continue;
    }
    utf8decode(cur, end, &seqlen);
    if (seqlen < 2) return 0;   // high-bit byte that is not a valid sequence
    if (widest < seqlen) widest = seqlen;
    cur += seqlen;
  }
  return widest;
}
1249 :
1250 : #endif /* defined(CPL_RECODE_STUB) */
|