1 : /**********************************************************************
2 : * $Id: cpl_recode_stub.cpp 23024 2011-09-02 19:45:20Z rouault $
3 : *
4 : * Name: cpl_recode_stub.cpp
5 : * Project: CPL - Common Portability Library
6 : * Purpose: Character set recoding and char/wchar_t conversions, stub
7 : * implementation to be used if iconv() functionality is not
8 : * available.
9 : * Author: Frank Warmerdam, warmerdam@pobox.com
10 : *
11 : * The bulk of this code is derived from the utf.c module from FLTK. It
12 : * was originally downloaded from:
13 : * http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
14 : *
15 : **********************************************************************
16 : * Copyright (c) 2008, Frank Warmerdam
17 : * Copyright 2006 by Bill Spitzak and others.
18 : *
19 : * Permission to use, copy, modify, and distribute this software for any
20 : * purpose with or without fee is hereby granted, provided that the above
21 : * copyright notice and this permission notice appear in all copies.
22 : *
23 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
24 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
25 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
26 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
27 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
28 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
29 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
30 : **********************************************************************/
31 :
32 : #include "cpl_string.h"
33 :
34 : CPL_CVSID("$Id: cpl_recode_stub.cpp 23024 2011-09-02 19:45:20Z rouault $");
35 :
36 : #ifdef CPL_RECODE_STUB
37 :
38 : static unsigned utf8decode(const char* p, const char* end, int* len);
39 : static unsigned utf8towc(const char* src, unsigned srclen,
40 : wchar_t* dst, unsigned dstlen);
41 : static unsigned utf8toa(const char* src, unsigned srclen,
42 : char* dst, unsigned dstlen);
43 : static unsigned utf8fromwc(char* dst, unsigned dstlen,
44 : const wchar_t* src, unsigned srclen);
45 : static unsigned utf8froma(char* dst, unsigned dstlen,
46 : const char* src, unsigned srclen);
47 : static int utf8test(const char* src, unsigned srclen);
48 :
49 : #ifdef FUTURE_NEEDS
50 : static const char* utf8fwd(const char* p, const char* start, const char* end);
51 : static const char* utf8back(const char* p, const char* start, const char*end);
52 : static int utf8encode(unsigned ucs, char* buf);
53 : static int utf8bytes(unsigned ucs);
54 : #endif /* def FUTURE_NEEDS */
55 :
56 : /************************************************************************/
57 : /* ==================================================================== */
58 : /* Stub Implementation not depending on iconv() or WIN32 API. */
59 : /* ==================================================================== */
60 : /************************************************************************/
61 :
62 : /************************************************************************/
63 : /* CPLRecodeStub() */
64 : /************************************************************************/
65 :
66 : /**
67 : * Convert a string from a source encoding to a destination encoding.
68 : *
69 : * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
70 : * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
71 : * <ul>
72 : * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in fact)</li>
73 : * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
74 : * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
75 : * </ul>
76 : *
77 : * If an error occurs an error may, or may not be posted with CPLError().
78 : *
79 : * @param pszSource a NULL terminated string.
80 : * @param pszSrcEncoding the source encoding.
81 : * @param pszDstEncoding the destination encoding.
82 : *
83 : * @return a NULL terminated string which should be freed with CPLFree().
84 : */
85 :
86 17986 : char *CPLRecodeStub( const char *pszSource,
87 : const char *pszSrcEncoding,
88 : const char *pszDstEncoding )
89 :
90 : {
91 : /* -------------------------------------------------------------------- */
92 : /* If the source or destination is current locale(), we change */
93 : /* it to ISO8859-1 since our stub implementation does not */
94 : /* attempt to address locales properly. */
95 : /* -------------------------------------------------------------------- */
96 :
97 17986 : if( pszSrcEncoding[0] == '\0' )
98 0 : pszSrcEncoding = CPL_ENC_ISO8859_1;
99 :
100 17986 : if( pszDstEncoding[0] == '\0' )
101 0 : pszDstEncoding = CPL_ENC_ISO8859_1;
102 :
103 : /* -------------------------------------------------------------------- */
104 : /* ISO8859 to UTF8 */
105 : /* -------------------------------------------------------------------- */
106 17986 : if( strcmp(pszSrcEncoding,CPL_ENC_ISO8859_1) == 0
107 : && strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
108 : {
109 11312 : int nCharCount = strlen(pszSource);
110 11312 : char *pszResult = (char *) CPLCalloc(1,nCharCount*2+1);
111 :
112 11312 : utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
113 :
114 11312 : return pszResult;
115 : }
116 :
117 : /* -------------------------------------------------------------------- */
118 : /* UTF8 to ISO8859 */
119 : /* -------------------------------------------------------------------- */
120 6674 : if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0
121 : && strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0 )
122 : {
123 6674 : int nCharCount = strlen(pszSource);
124 6674 : char *pszResult = (char *) CPLCalloc(1,nCharCount+1);
125 :
126 6674 : utf8toa( pszSource, nCharCount, pszResult, nCharCount+1 );
127 :
128 6674 : return pszResult;
129 : }
130 :
131 : /* -------------------------------------------------------------------- */
132 : /* Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with */
133 : /* a one-time warning. */
134 : /* -------------------------------------------------------------------- */
135 0 : if( strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
136 : {
137 0 : int nCharCount = strlen(pszSource);
138 0 : char *pszResult = (char *) CPLCalloc(1,nCharCount*2+1);
139 : static int bHaveWarned = FALSE;
140 :
141 0 : if( !bHaveWarned )
142 : {
143 0 : bHaveWarned = 1;
144 : CPLError( CE_Warning, CPLE_AppDefined,
145 : "Recode from %s to UTF-8 not supported, treated as ISO8859-1 to UTF-8.",
146 0 : pszSrcEncoding );
147 : }
148 :
149 0 : utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
150 :
151 0 : return pszResult;
152 : }
153 :
154 : /* -------------------------------------------------------------------- */
155 : /* UTF-8 to anything else is treated as UTF-8 to ISO-8859-1 */
156 : /* with a warning. */
157 : /* -------------------------------------------------------------------- */
158 0 : if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0
159 : && strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0 )
160 : {
161 0 : int nCharCount = strlen(pszSource);
162 0 : char *pszResult = (char *) CPLCalloc(1,nCharCount+1);
163 : static int bHaveWarned = FALSE;
164 :
165 0 : if( !bHaveWarned )
166 : {
167 0 : bHaveWarned = 1;
168 : CPLError( CE_Warning, CPLE_AppDefined,
169 : "Recode from UTF-8 to %s not supported, treated as UTF-8 to ISO8859-1.",
170 0 : pszDstEncoding );
171 : }
172 :
173 0 : utf8toa( pszSource, nCharCount, pszResult, nCharCount+1 );
174 :
175 0 : return pszResult;
176 : }
177 :
178 : /* -------------------------------------------------------------------- */
179 : /* Everything else is treated as a no-op with a warning. */
180 : /* -------------------------------------------------------------------- */
181 : {
182 : static int bHaveWarned = FALSE;
183 :
184 0 : if( !bHaveWarned )
185 : {
186 0 : bHaveWarned = 1;
187 : CPLError( CE_Warning, CPLE_AppDefined,
188 : "Recode from %s to %s not supported, no change applied.",
189 0 : pszSrcEncoding, pszDstEncoding );
190 : }
191 :
192 0 : return CPLStrdup(pszSource);
193 : }
194 : }
195 :
196 : /************************************************************************/
197 : /* CPLRecodeFromWCharStub() */
198 : /************************************************************************/
199 :
200 : /**
201 : * Convert wchar_t string to UTF-8.
202 : *
203 : * Convert a wchar_t string into a multibyte utf-8 string. The only
204 : * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
205 : * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
206 : * and CPL_ENC_ISO8859_1. In some cases (ie. using iconv()) other encodings
207 : * may also be supported.
208 : *
209 : * Note that the wchar_t type varies in size on different systems. On
210 : * win32 it is normally 2 bytes, and on unix 4 bytes.
211 : *
212 : * If an error occurs an error may, or may not be posted with CPLError().
213 : *
214 : * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
215 : * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
216 : * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
217 : *
218 : * @return a zero terminated multi-byte string which should be freed with
219 : * CPLFree(), or NULL if an error occurs.
220 : */
221 :
222 2248 : char *CPLRecodeFromWCharStub( const wchar_t *pwszSource,
223 : const char *pszSrcEncoding,
224 : const char *pszDstEncoding )
225 :
226 : {
227 : /* -------------------------------------------------------------------- */
228 : /* We try to avoid changes of character set. We are just */
229 : /* providing for unicode to unicode. */
230 : /* -------------------------------------------------------------------- */
231 2248 : if( strcmp(pszSrcEncoding,"WCHAR_T") != 0 &&
232 : strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0
233 : && strcmp(pszSrcEncoding,CPL_ENC_UTF16) != 0
234 : && strcmp(pszSrcEncoding,CPL_ENC_UCS2) != 0
235 : && strcmp(pszSrcEncoding,CPL_ENC_UCS4) != 0 )
236 : {
237 : CPLError( CE_Failure, CPLE_AppDefined,
238 : "Stub recoding implementation does not support\n"
239 : "CPLRecodeFromWCharStub(...,%s,%s)",
240 0 : pszSrcEncoding, pszDstEncoding );
241 0 : return NULL;
242 : }
243 :
244 : /* -------------------------------------------------------------------- */
245 : /* What is the source length. */
246 : /* -------------------------------------------------------------------- */
247 2248 : int nSrcLen = 0;
248 :
249 22706 : while( pwszSource[nSrcLen] != 0 )
250 18210 : nSrcLen++;
251 :
252 : /* -------------------------------------------------------------------- */
253 : /* Allocate destination buffer plenty big. */
254 : /* -------------------------------------------------------------------- */
255 : char *pszResult;
256 : int nDstBufSize, nDstLen;
257 :
258 2248 : nDstBufSize = nSrcLen * 4 + 1;
259 2248 : pszResult = (char *) CPLMalloc(nDstBufSize); // nearly worst case.
260 :
261 2248 : if (nSrcLen == 0)
262 : {
263 0 : pszResult[0] = '\0';
264 0 : return pszResult;
265 : }
266 :
267 : /* -------------------------------------------------------------------- */
268 : /* Convert, and confirm we had enough space. */
269 : /* -------------------------------------------------------------------- */
270 2248 : nDstLen = utf8fromwc( pszResult, nDstBufSize, pwszSource, nSrcLen );
271 2248 : if( nDstLen >= nDstBufSize - 1 )
272 : {
273 0 : CPLAssert( FALSE ); // too small!
274 0 : return NULL;
275 : }
276 :
277 : /* -------------------------------------------------------------------- */
278 : /* If something other than UTF-8 was requested, recode now. */
279 : /* -------------------------------------------------------------------- */
280 2248 : if( strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
281 2248 : return pszResult;
282 :
283 : char *pszFinalResult =
284 0 : CPLRecodeStub( pszResult, CPL_ENC_UTF8, pszDstEncoding );
285 :
286 0 : CPLFree( pszResult );
287 :
288 0 : return pszFinalResult;
289 : }
290 :
291 : /************************************************************************/
292 : /* CPLRecodeToWCharStub() */
293 : /************************************************************************/
294 :
295 : /**
296 : * Convert UTF-8 string to a wchar_t string.
297 : *
298 : * Convert a 8bit, multi-byte per character input string into a wide
299 : * character (wchar_t) string. The only guaranteed supported source encodings
300 : * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1). The only
301 : * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source
302 : * and destination encodings may be supported depending on the underlying
303 : * implementation.
304 : *
305 : * Note that the wchar_t type varies in size on different systems. On
306 : * win32 it is normally 2 bytes, and on unix 4 bytes.
307 : *
308 : * If an error occurs an error may, or may not be posted with CPLError().
309 : *
310 : * @param pszSource input multi-byte character string.
311 : * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
312 : * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
313 : *
314 : * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
315 : * NULL on error.
316 : *
317 : * @since GDAL 1.6.0
318 : */
319 :
320 5140 : wchar_t *CPLRecodeToWCharStub( const char *pszSource,
321 : const char *pszSrcEncoding,
322 : const char *pszDstEncoding )
323 :
324 : {
325 5140 : char *pszUTF8Source = (char *) pszSource;
326 :
327 5140 : if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0
328 : && strcmp(pszSrcEncoding,CPL_ENC_ASCII) != 0 )
329 : {
330 0 : pszUTF8Source = CPLRecodeStub( pszSource, pszSrcEncoding, CPL_ENC_UTF8 );
331 0 : if( pszUTF8Source == NULL )
332 0 : return NULL;
333 : }
334 :
335 : /* -------------------------------------------------------------------- */
336 : /* We try to avoid changes of character set. We are just */
337 : /* providing for unicode to unicode. */
338 : /* -------------------------------------------------------------------- */
339 5140 : if( strcmp(pszDstEncoding,"WCHAR_T") != 0
340 : && strcmp(pszDstEncoding,CPL_ENC_UCS2) != 0
341 : && strcmp(pszDstEncoding,CPL_ENC_UCS4) != 0
342 : && strcmp(pszDstEncoding,CPL_ENC_UTF16) != 0 )
343 : {
344 : CPLError( CE_Failure, CPLE_AppDefined,
345 : "Stub recoding implementation does not support\n"
346 : "CPLRecodeToWCharStub(...,%s,%s)",
347 0 : pszSrcEncoding, pszDstEncoding );
348 0 : return NULL;
349 : }
350 :
351 : /* -------------------------------------------------------------------- */
352 : /* Do the UTF-8 to UCS-2 recoding. */
353 : /* -------------------------------------------------------------------- */
354 5140 : int nSrcLen = strlen(pszUTF8Source);
355 5140 : wchar_t *pwszResult = (wchar_t *) CPLCalloc(sizeof(wchar_t),nSrcLen+1);
356 :
357 5140 : utf8towc( pszUTF8Source, nSrcLen, pwszResult, nSrcLen+1 );
358 :
359 5140 : if( pszUTF8Source != pszSource )
360 0 : CPLFree( pszUTF8Source );
361 :
362 5140 : return pwszResult;
363 : }
364 :
365 :
366 : /************************************************************************/
367 : /* CPLIsUTF8() */
368 : /************************************************************************/
369 :
370 : /**
371 : * Test if a string is encoded as UTF-8.
372 : *
373 : * @param pabyData input string to test
374 : * @param nLen length of the input string, or -1 if the function must compute
375 : * the string length. In which case it must be null terminated.
376 : * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
377 : *
378 : * @since GDAL 1.7.0
379 : */
380 951570 : int CPLIsUTF8Stub(const char* pabyData, int nLen)
381 : {
382 951570 : if (nLen < 0)
383 951570 : nLen = strlen(pabyData);
384 951570 : return utf8test(pabyData, (unsigned)nLen) != 0;
385 : }
386 :
387 : /************************************************************************/
388 : /* ==================================================================== */
389 : /* UTF.C code from FLTK with some modifications. */
390 : /* ==================================================================== */
391 : /************************************************************************/
392 :
/* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
   they are instead turned into the Unicode REPLACEMENT CHARACTER, of
   value 0xfffd.
   If this is on utf8decode will correctly map most (perhaps all)
   human-readable text that is in ISO-8859-1. This may allow you
   to completely ignore character sets in your code because virtually
   everything is either ISO-8859-1 or UTF-8.
*/
#define ERRORS_TO_ISO8859_1 1

/* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
   Unicode index for Microsoft's CP1252 character set. You should
   also set ERRORS_TO_ISO8859_1. With this a huge amount of more
   available text (such as all web pages) are correctly converted
   to Unicode.
*/
#define ERRORS_TO_CP1252 1

/* A number of Unicode code points are in fact illegal and should not
   be produced by a UTF-8 converter. Turning this on will replace the
   bytes in those encodings with errors. If you do this then converting
   arbitrary 16-bit data to UTF-8 and then back is not an identity,
   which will probably break a lot of software.
*/
#define STRICT_RFC3629 0

#if ERRORS_TO_CP1252
// Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
// to Unicode. Indexed by (byte - 0x80). Read-only lookup table.
static const unsigned short cp1252[32] = {
  0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
  0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
  0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
  0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
};
#endif
429 :
430 : /************************************************************************/
431 : /* utf8decode() */
432 : /************************************************************************/
433 :
434 : /*
435 : Decode a single UTF-8 encoded character starting at \e p. The
436 : resulting Unicode value (in the range 0-0x10ffff) is returned,
437 : and \e len is set to the number of bytes in the UTF-8 encoding
438 : (adding \e len to \e p will point at the next character).
439 :
440 : If \a p points at an illegal UTF-8 encoding, including one that
441 : would go past \e end, or where a code uses more bytes than
442 : necessary, then *(unsigned char*)p is translated as though it is
443 : in the Microsoft CP1252 character set and \e len is set to 1.
444 : Treating errors this way allows this to decode almost any
445 : ISO-8859-1 or CP1252 text that has been mistakenly placed where
446 : UTF-8 is expected, and has proven very useful.
447 :
448 : If you want errors to be converted to error characters (as the
449 : standards recommend), adding a test to see if the length is
450 : unexpectedly 1 will work:
451 :
452 : \code
453 : if (*p & 0x80) { // what should be a multibyte encoding
454 : code = utf8decode(p,end,&len);
455 : if (len<2) code = 0xFFFD; // Turn errors into REPLACEMENT CHARACTER
456 : } else { // handle the 1-byte utf8 encoding:
457 : code = *p;
458 : len = 1;
459 : }
460 : \endcode
461 :
462 : Direct testing for the 1-byte case (as shown above) will also
463 : speed up the scanning of strings where the majority of characters
464 : are ASCII.
465 : */
/* Decode the single UTF-8 sequence starting at p, never reading at or
   beyond end.  Returns the decoded value and stores the number of bytes
   consumed in *len.  Any invalid, overlong or truncated sequence consumes
   exactly one byte; what that byte decodes to depends on ERRORS_TO_CP1252
   and ERRORS_TO_ISO8859_1. */
static unsigned utf8decode(const char* p, const char* end, int* len)
{
  /* Work on unsigned bytes so comparisons do not depend on the
     signedness of plain char. */
  const unsigned char* s = (const unsigned char*)p;
  const unsigned char c = s[0];
  if (c < 0x80) {
    *len = 1;
    return c;
#if ERRORS_TO_CP1252
  } else if (c < 0xa0) {
    /* Stray continuation byte: interpret as CP1252. */
    *len = 1;
    return cp1252[c-0x80];
#endif
  } else if (c < 0xc2) {
    /* Continuation byte, or 0xc0/0xc1 which only start overlong forms. */
    goto FAIL;
  }
  /* Every remaining lead byte needs at least one continuation byte. */
  if (p+1 >= end || (s[1]&0xc0) != 0x80) goto FAIL;
  if (c < 0xe0) {
    *len = 2;
    return
      ((unsigned)(s[0] & 0x1f) << 6) +
      ((unsigned)(s[1] & 0x3f));
  } else if (c == 0xe0) {
    /* Reject overlong 3-byte encodings. */
    if (s[1] < 0xa0) goto FAIL;
    goto UTF8_3;
#if STRICT_RFC3629
  } else if (c == 0xed) {
    // RFC 3629 says surrogate chars are illegal.
    if (s[1] >= 0xa0) goto FAIL;
    goto UTF8_3;
  } else if (c == 0xef) {
    // 0xfffe and 0xffff are also illegal characters.
    // Only look at the third byte once it is known to be in range.
    if (p+2 < end && s[1]==0xbf && s[2]>=0xbe) goto FAIL;
    goto UTF8_3;
#endif
  } else if (c < 0xf0) {
  UTF8_3:
    if (p+2 >= end || (s[2]&0xc0) != 0x80) goto FAIL;
    *len = 3;
    return
      ((unsigned)(s[0] & 0x0f) << 12) +
      ((unsigned)(s[1] & 0x3f) << 6) +
      ((unsigned)(s[2] & 0x3f));
  } else if (c == 0xf0) {
    /* Reject overlong 4-byte encodings. */
    if (s[1] < 0x90) goto FAIL;
    goto UTF8_4;
  } else if (c < 0xf4) {
  UTF8_4:
    if (p+3 >= end || (s[2]&0xc0) != 0x80 || (s[3]&0xc0) != 0x80) goto FAIL;
    *len = 4;
#if STRICT_RFC3629
    // RFC 3629 says all codes ending in fffe or ffff are illegal:
    if ((s[1]&0xf)==0xf && s[2] == 0xbf && s[3] >= 0xbe) goto FAIL;
#endif
    return
      ((unsigned)(s[0] & 0x07) << 18) +
      ((unsigned)(s[1] & 0x3f) << 12) +
      ((unsigned)(s[2] & 0x3f) << 6) +
      ((unsigned)(s[3] & 0x3f));
  } else if (c == 0xf4) {
    if (s[1] > 0x8f) goto FAIL; // after 0x10ffff
    goto UTF8_4;
  } else {
  FAIL:
    *len = 1;
#if ERRORS_TO_ISO8859_1
    return c;
#else
    return 0xfffd; // Unicode REPLACEMENT CHARACTER
#endif
  }
}
539 :
540 : /************************************************************************/
541 : /* utf8fwd() */
542 : /************************************************************************/
543 :
544 : /*
545 : Move \a p forward until it points to the start of a UTF-8
546 : character. If it already points at the start of one then it
547 : is returned unchanged. Any UTF-8 errors are treated as though each
548 : byte of the error is an individual character.
549 :
550 : \e start is the start of the string and is used to limit the
551 : backwards search for the start of a utf8 character.
552 :
553 : \e end is the end of the string and is assumed to be a break
554 : between characters. It is assumed to be greater than p.
555 :
556 : This function is for moving a pointer that was jumped to the
557 : middle of a string, such as when doing a binary search for
558 : a position. You should use either this or utf8back() depending
559 : on which direction your algorithm can handle the pointer
560 : moving. Do not use this to scan strings, use utf8decode()
561 : instead.
562 : */
563 :
564 : #ifdef FUTURE_NEEDS
565 : static const char* utf8fwd(const char* p, const char* start, const char* end)
566 : {
567 : const char* a;
568 : int len;
569 : // if we are not pointing at a continuation character, we are done:
570 : if ((*p&0xc0) != 0x80) return p;
571 : // search backwards for a 0xc0 starting the character:
572 : for (a = p-1; ; --a) {
573 : if (a < start) return p;
574 : if (!(a[0]&0x80)) return p;
575 : if ((a[0]&0x40)) break;
576 : }
577 : utf8decode(a,end,&len);
578 : a += len;
579 : if (a > p) return a;
580 : return p;
581 : }
582 : #endif /* def FUTURE_NEEDS */
583 :
584 : /************************************************************************/
585 : /* utf8back() */
586 : /************************************************************************/
587 :
588 : /*
589 : Move \a p backward until it points to the start of a UTF-8
590 : character. If it already points at the start of one then it
591 : is returned unchanged. Any UTF-8 errors are treated as though each
592 : byte of the error is an individual character.
593 :
594 : \e start is the start of the string and is used to limit the
595 : backwards search for the start of a UTF-8 character.
596 :
597 : \e end is the end of the string and is assumed to be a break
598 : between characters. It is assumed to be greater than p.
599 :
600 : If you wish to decrement a UTF-8 pointer, pass p-1 to this.
601 : */
602 :
603 : #ifdef FUTURE_NEEDS
604 : static const char* utf8back(const char* p, const char* start, const char* end)
605 : {
606 : const char* a;
607 : int len;
608 : // if we are not pointing at a continuation character, we are done:
609 : if ((*p&0xc0) != 0x80) return p;
610 : // search backwards for a 0xc0 starting the character:
611 : for (a = p-1; ; --a) {
612 : if (a < start) return p;
613 : if (!(a[0]&0x80)) return p;
614 : if ((a[0]&0x40)) break;
615 : }
616 : utf8decode(a,end,&len);
617 : if (a+len > p) return a;
618 : return p;
619 : }
620 : #endif /* def FUTURE_NEEDS */
621 :
622 : /************************************************************************/
623 : /* utf8bytes() */
624 : /************************************************************************/
625 :
626 : /* Returns number of bytes that utf8encode() will use to encode the
627 : character \a ucs. */
628 : #ifdef FUTURE_NEEDS
/* Number of bytes utf8encode() writes for ucs: 1 to 4 for values up to
   and including 0x10ffff, and 3 (the length of the encoded REPLACEMENT
   CHARACTER) for anything larger. */
static int utf8bytes(unsigned ucs) {
  if (ucs < 0x000080U) {
    return 1;
  } else if (ucs < 0x000800U) {
    return 2;
  } else if (ucs < 0x010000U) {
    return 3;
  } else if (ucs <= 0x10ffffU) {
    // 0x10ffff itself is the last code point and takes four bytes
    return 4;
  } else {
    return 3; // length of the illegal character encoding
  }
}
642 : #endif /* def FUTURE_NEEDS */
643 :
644 : /************************************************************************/
645 : /* utf8encode() */
646 : /************************************************************************/
647 :
648 : /* Write the UTF-8 encoding of \e ucs into \e buf and return the
649 : number of bytes written. Up to 4 bytes may be written. If you know
650 : that \a ucs is less than 0x10000 then at most 3 bytes will be written.
651 : If you wish to speed this up, remember that anything less than 0x80
652 : is written as a single byte.
653 :
654 : If ucs is greater than 0x10ffff this is an illegal character
655 : according to RFC 3629. These are converted as though they are
656 : 0xFFFD (REPLACEMENT CHARACTER).
657 :
658 : RFC 3629 also says many other values for \a ucs are illegal (in
659 : the range 0xd800 to 0xdfff, or ending with 0xfffe or
660 : 0xffff). However I encode these as though they are legal, so that
661 : utf8encode/utf8decode will be the identity for all codes between 0
662 : and 0x10ffff.
663 : */
664 : #ifdef FUTURE_NEEDS
/* Write the UTF-8 encoding of ucs into buf and return the number of bytes
   written (1 to 4).  No terminator is written.  Values above 0x10ffff are
   written as the three-byte encoding of 0xfffd (REPLACEMENT CHARACTER). */
static int utf8encode(unsigned ucs, char* buf) {
  if (ucs < 0x000080U) {
    buf[0] = (char)ucs;
    return 1;
  } else if (ucs < 0x000800U) {
    buf[0] = (char)(0xc0 | (ucs >> 6));
    buf[1] = (char)(0x80 | (ucs & 0x3F));
    return 2;
  } else if (ucs < 0x010000U) {
    buf[0] = (char)(0xe0 | (ucs >> 12));
    buf[1] = (char)(0x80 | ((ucs >> 6) & 0x3F));
    buf[2] = (char)(0x80 | (ucs & 0x3F));
    return 3;
  } else if (ucs <= 0x0010ffffU) {
    // 0x10ffff itself is the last code point and must be encoded as-is
    buf[0] = (char)(0xf0 | (ucs >> 18));
    buf[1] = (char)(0x80 | ((ucs >> 12) & 0x3F));
    buf[2] = (char)(0x80 | ((ucs >> 6) & 0x3F));
    buf[3] = (char)(0x80 | (ucs & 0x3F));
    return 4;
  } else {
    // encode 0xfffd:
    buf[0] = (char)0xefU;
    buf[1] = (char)0xbfU;
    buf[2] = (char)0xbdU;
    return 3;
  }
}
692 : #endif /* def FUTURE_NEEDS */
693 :
694 : /************************************************************************/
695 : /* utf8towc() */
696 : /************************************************************************/
697 :
698 : /* Convert a UTF-8 sequence into an array of wchar_t. These
699 : are used by some system calls, especially on Windows.
700 :
701 : \a src points at the UTF-8, and \a srclen is the number of bytes to
702 : convert.
703 :
704 : \a dst points at an array to write, and \a dstlen is the number of
705 : locations in this array. At most \a dstlen-1 words will be
706 : written there, plus a 0 terminating word. Thus this function
707 : will never overwrite the buffer and will always return a
708 : zero-terminated string. If \a dstlen is zero then \a dst can be
709 : null and no data is written, but the length is returned.
710 :
711 : The return value is the number of words that \e would be written
712 : to \a dst if it were long enough, not counting the terminating
713 : zero. If the return value is greater or equal to \a dstlen it
714 : indicates truncation, you can then allocate a new array of size
715 : return+1 and call this again.
716 :
717 : Errors in the UTF-8 are converted as though each byte in the
718 : erroneous string is in the Microsoft CP1252 encoding. This allows
719 : ISO-8859-1 text mistakenly identified as UTF-8 to be printed
720 : correctly.
721 :
722 : Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
723 : and most other systems. Where wchar_t is 16 bits, Unicode
724 : characters in the range 0x10000 to 0x10ffff are converted to
725 : "surrogate pairs" which take two words each (this is called UTF-16
726 : encoding). If wchar_t is 32 bits this rather nasty problem is
727 : avoided.
728 : */
729 5140 : static unsigned utf8towc(const char* src, unsigned srclen,
730 : wchar_t* dst, unsigned dstlen)
731 : {
732 5140 : const char* p = src;
733 5140 : const char* e = src+srclen;
734 5140 : unsigned count = 0;
735 24664 : if (dstlen) for (;;) {
736 24664 : if (p >= e) {dst[count] = 0; return count;}
737 19524 : if (!(*p & 0x80)) { // ascii
738 19268 : dst[count] = *p++;
739 : } else {
740 256 : int len; unsigned ucs = utf8decode(p,e,&len);
741 256 : p += len;
742 : #ifdef _WIN32
743 : if (ucs < 0x10000) {
744 : dst[count] = (wchar_t)ucs;
745 : } else {
746 : // make a surrogate pair:
747 : if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
748 : dst[count] = (wchar_t)((((ucs-0x10000u)>>10)&0x3ff) | 0xd800);
749 : dst[++count] = (wchar_t)((ucs&0x3ff) | 0xdc00);
750 : }
751 : #else
752 256 : dst[count] = (wchar_t)ucs;
753 : #endif
754 : }
755 19524 : if (++count == dstlen) {dst[count-1] = 0; break;}
756 : }
757 : // we filled dst, measure the rest:
758 0 : while (p < e) {
759 0 : if (!(*p & 0x80)) p++;
760 : else {
761 : #ifdef _WIN32
762 : int len; unsigned ucs = utf8decode(p,e,&len);
763 : p += len;
764 : if (ucs >= 0x10000) ++count;
765 : #else
766 0 : int len; utf8decode(p,e,&len);
767 0 : p += len;
768 : #endif
769 : }
770 0 : ++count;
771 : }
772 0 : return count;
773 : }
774 :
775 : /************************************************************************/
776 : /* utf8toa() */
777 : /************************************************************************/
778 : /* Convert a UTF-8 sequence into an array of 1-byte characters.
779 :
780 : If the UTF-8 decodes to a character greater than 0xff then it is
781 : replaced with '?'.
782 :
783 : Errors in the UTF-8 are converted as individual bytes, same as
784 : utf8decode() does. This allows ISO-8859-1 text mistakenly identified
785 : as UTF-8 to be printed correctly (and possibly CP1252 on Windows).
786 :
787 : \a src points at the UTF-8, and \a srclen is the number of bytes to
788 : convert.
789 :
790 : Up to \a dstlen bytes are written to \a dst, including a null
791 : terminator. The return value is the number of bytes that would be
792 : written, not counting the null terminator. If greater or equal to
793 : \a dstlen then if you malloc a new array of size n+1 you will have
794 : the space needed for the entire string. If \a dstlen is zero then
795 : nothing is written and this call just measures the storage space
796 : needed.
797 : */
798 6674 : static unsigned utf8toa(const char* src, unsigned srclen,
799 : char* dst, unsigned dstlen)
800 : {
801 6674 : const char* p = src;
802 6674 : const char* e = src+srclen;
803 6674 : unsigned count = 0;
804 49204 : if (dstlen) for (;;) {
805 : unsigned char c;
806 49204 : if (p >= e) {dst[count] = 0; return count;}
807 42530 : c = *(unsigned char*)p;
808 42530 : if (c < 0xC2) { // ascii or bad code
809 42530 : dst[count] = c;
810 42530 : p++;
811 : } else {
812 0 : int len; unsigned ucs = utf8decode(p,e,&len);
813 0 : p += len;
814 0 : if (ucs < 0x100) dst[count] = (char)ucs;
815 : else
816 : {
817 : static int bHasWarned = FALSE;
818 0 : if (!bHasWarned)
819 : {
820 0 : bHasWarned = TRUE;
821 : CPLError(CE_Warning, CPLE_AppDefined,
822 : "One or several characters couldn't be converted correctly from UTF-8 to ISO-8859-1.\n"
823 0 : "This warning will not be emitted anymore");
824 : }
825 0 : dst[count] = '?';
826 : }
827 : }
828 42530 : if (++count >= dstlen) {dst[count-1] = 0; break;}
829 : }
830 : // we filled dst, measure the rest:
831 0 : while (p < e) {
832 0 : if (!(*p & 0x80)) p++;
833 : else {
834 : int len;
835 0 : utf8decode(p,e,&len);
836 0 : p += len;
837 : }
838 0 : ++count;
839 : }
840 0 : return count;
841 : }
842 :
843 : /************************************************************************/
844 : /* utf8fromwc() */
845 : /************************************************************************/
846 : /* Turn "wide characters" as returned by some system calls
847 : (especially on Windows) into UTF-8.
848 :
849 : Up to \a dstlen bytes are written to \a dst, including a null
850 : terminator. The return value is the number of bytes that would be
851 : written, not counting the null terminator. If greater or equal to
852 : \a dstlen then if you malloc a new array of size n+1 you will have
853 : the space needed for the entire string. If \a dstlen is zero then
854 : nothing is written and this call just measures the storage space
855 : needed.
856 :
857 : \a srclen is the number of words in \a src to convert. On Windows
858 : this is not necessarily the number of characters, due to there
859 : possibly being "surrogate pairs" in the UTF-16 encoding used.
860 : On Unix wchar_t is 32 bits and each location is a character.
861 :
862 : On Unix if a src word is greater than 0x10ffff then this is an
863 : illegal character according to RFC 3629. These are converted as
864 : though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
865 : range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
866 : illegal according to RFC 3629. However I encode these as though
867 : they are legal, so that utf8towc will return the original data.
868 :
869 : On Windows "surrogate pairs" are converted to a single character
870 : and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
871 : pairs are converted as though they are individual characters.
872 : */
static unsigned utf8fromwc(char* dst, unsigned dstlen,
                           const wchar_t* src, unsigned srclen) {
  // Encodes srclen wide characters from src as UTF-8.  At most dstlen
  // bytes (including the terminating NUL) are stored in dst; the return
  // value is the number of bytes the whole conversion needs, not counting
  // the NUL.  With dstlen == 0 nothing is stored.
  //
  // Storing and measuring share a single loop so that both always decode
  // the input the same way (in particular surrogate pairs on Windows).
  unsigned i = 0;                   // next word of src to read
  unsigned count = 0;               // UTF-8 bytes required so far
  int writing = (dstlen != 0);      // cleared once dst cannot take more

  while (i < srclen) {
    unsigned ucs = src[i++];
    unsigned width;

#ifdef _WIN32
    // A high surrogate directly followed by a low surrogate forms one
    // character.  i already indexes the word after ucs at this point.
    if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
        src[i] >= 0xdc00 && src[i] <= 0xdfff) {
      unsigned ucs2 = src[i++];
      ucs = 0x10000U + ((ucs&0x3ff)<<10) + (ucs2&0x3ff);
    }
#endif
    // Values beyond the Unicode range become REPLACEMENT CHARACTER.
    if (ucs > 0x10ffff) ucs = 0xfffd;

    if (ucs < 0x80U) width = 1;
    else if (ucs < 0x800U) width = 2;
    else if (ucs < 0x10000U) width = 3;
    else width = 4;

    if (writing) {
      if (count + width >= dstlen) {
        // The sequence plus the NUL does not fit: terminate here and keep
        // going in measuring mode.
        dst[count] = 0;
        writing = 0;
      } else {
        char* out = dst + count;
        switch (width) {
        case 1:
          out[0] = (char)ucs;
          break;
        case 2:
          out[0] = (char)(0xc0 | (ucs >> 6));
          out[1] = (char)(0x80 | (ucs & 0x3F));
          break;
        case 3:
          out[0] = (char)(0xe0 | (ucs >> 12));
          out[1] = (char)(0x80 | ((ucs >> 6) & 0x3F));
          out[2] = (char)(0x80 | (ucs & 0x3F));
          break;
        default:
          out[0] = (char)(0xf0 | (ucs >> 18));
          out[1] = (char)(0x80 | ((ucs >> 12) & 0x3F));
          out[2] = (char)(0x80 | ((ucs >> 6) & 0x3F));
          out[3] = (char)(0x80 | (ucs & 0x3F));
          break;
        }
      }
    }
    count += width;
  }

  // Input exhausted while still storing: count < dstlen holds here.
  if (writing) dst[count] = 0;
  return count;
}
940 :
941 :
942 : /************************************************************************/
943 : /* utf8froma() */
944 : /************************************************************************/
945 :
946 : /* Convert an ISO-8859-1 (ie normal c-string) byte stream to UTF-8.
947 :
948 : It is possible this should convert Microsoft's CP1252 to UTF-8
949 : instead. This would translate the codes in the range 0x80-0x9f
950 : to different characters. Currently it does not do this.
951 :
952 : Up to \a dstlen bytes are written to \a dst, including a null
953 : terminator. The return value is the number of bytes that would be
954 : written, not counting the null terminator. If greater or equal to
955 : \a dstlen then if you malloc a new array of size n+1 you will have
956 : the space needed for the entire string. If \a dstlen is zero then
957 : nothing is written and this call just measures the storage space
958 : needed.
959 :
960 : \a srclen is the number of bytes in \a src to convert.
961 :
962 : If the return value equals \a srclen then this indicates that
963 : no conversion is necessary, as only ASCII characters are in the
964 : string.
965 : */
static unsigned utf8froma(char* dst, unsigned dstlen,
                          const char* src, unsigned srclen) {
  // Re-encodes srclen ISO-8859-1 bytes from src as UTF-8.  At most dstlen
  // bytes (including the terminating NUL) are stored in dst; the return
  // value is the size of the complete result, not counting the NUL.
  const unsigned char* in = (const unsigned char*)src;
  unsigned pos;
  unsigned needed = 0;              // UTF-8 bytes required so far
  int storing = (dstlen != 0);      // cleared once dst cannot take more

  for (pos = 0; pos < srclen; pos++) {
    const unsigned char byte = in[pos];
    // ASCII stays one byte, everything else becomes a two-byte sequence.
    const unsigned width = (byte < 0x80U) ? 1 : 2;

    if (storing) {
      if (needed + width >= dstlen) {
        // No room for this character plus the NUL: terminate and from
        // now on only measure.
        dst[needed] = 0;
        storing = 0;
      } else if (width == 1) {
        dst[needed] = (char)byte;
      } else {
        dst[needed] = (char)(0xc0 | (byte >> 6));
        dst[needed+1] = (char)(0x80 | (byte & 0x3F));
      }
    }
    needed += width;
  }

  if (storing) dst[needed] = 0;
  return needed;
}
995 :
996 : /*
997 : ** For now we disable the rest which is locale() related. We may need
998 : ** parts of it later.
999 : */
1000 :
1001 : #ifdef notdef
1002 :
1003 : #ifdef _WIN32
1004 : # include <windows.h>
1005 : #endif
1006 :
1007 : /*! Return true if the "locale" seems to indicate that UTF-8 encoding
1008 : is used. If true the utf8tomb and utf8frommb don't do anything
1009 : useful.
1010 :
1011 : <i>It is highly recommended that you change your system so this
1012 : does return true.</i> On Windows this is done by setting the
1013 : "codepage" to CP_UTF8. On Unix this is done by setting $LC_CTYPE
1014 : to a string containing the letters "utf" or "UTF" in it, or by
1015 : deleting all $LC* and $LANG environment variables. In the future
1016 : it is likely that all non-Asian Unix systems will return true,
1017 : due to the compatibility of UTF-8 with ISO-8859-1.
1018 : */
int utf8locale(void) {
  // Reports whether the process locale appears to use UTF-8.  The answer
  // is computed on the first call and cached; 2 marks "not computed yet".
  static int ret = 2;
  if (ret == 2) {
#ifdef _WIN32
    ret = GetACP() == CP_UTF8;
#else
    // POSIX precedence for the character-type category: LC_ALL overrides
    // LC_CTYPE, which overrides LANG.  The first variable that is set to
    // a non-empty value decides.
    static const char* const vars[] = { "LC_ALL", "LC_CTYPE", "LANG" };
    unsigned k;
    ret = 1; // assume UTF-8 if no locale is configured
    for (k = 0; k < sizeof(vars)/sizeof(vars[0]); k++) {
      const char* s = getenv(vars[k]);
      if (s && *s) {
        ret = (strstr(s,"utf") || strstr(s,"UTF")) ? 1 : 0;
        break;
      }
    }
#endif
  }
  return ret;
}
1036 :
1037 : /*! Convert the UTF-8 used by FLTK to the locale-specific encoding
1038 : used for filenames (and sometimes used for data in files).
1039 : Unfortunately due to stupid design you will have to do this as
1040 : needed for filenames. This is a bug on both Unix and Windows.
1041 :
1042 : Up to \a dstlen bytes are written to \a dst, including a null
1043 : terminator. The return value is the number of bytes that would be
1044 : written, not counting the null terminator. If greater or equal to
1045 : \a dstlen then if you malloc a new array of size n+1 you will have
1046 : the space needed for the entire string. If \a dstlen is zero then
1047 : nothing is written and this call just measures the storage space
1048 : needed.
1049 :
1050 : If utf8locale() returns true then this does not change the data.
1051 : It is copied and truncated as necessary to
1052 : the destination buffer and \a srclen is always returned. */
unsigned utf8tomb(const char* src, unsigned srclen,
		  char* dst, unsigned dstlen)
{
  // Converts srclen bytes of UTF-8 into the locale's multibyte encoding.
  // At most dstlen bytes (including the NUL) are stored in dst; the
  // return value is the length of the complete result.  If the locale is
  // UTF-8 already, or the conversion fails, the bytes are copied through
  // unchanged and srclen is returned.
  if (!utf8locale()) {
    // Decode to wide characters first; fall back to the heap when the
    // stack buffer is too small.
    wchar_t lbuf[1024];
    wchar_t* buf = lbuf;
    unsigned length = utf8towc(src, srclen, buf, 1024);
    if (length >= 1024) {
      buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
      if (buf) utf8towc(src, srclen, buf, length+1);
    }
    if (buf) {
#ifdef _WIN32
      unsigned ret = 0;
      if (dstlen > 1) {
	// Offer only dstlen-1 bytes so the terminator written below always
	// stays inside dst (the API does not null-terminate here).
	ret =
	  WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen-1, 0, 0);
	dst[ret] = 0;
      } else if (dstlen == 1) {
	dst[0] = 0;
      }
      // Measuring, overflow (the call returns 0 when the output does not
      // fit) or an exactly full buffer: get the actual length.
      if (dstlen <= 1 || ret == 0 || ret >= dstlen-1)
	ret =
	  WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
      if (buf != lbuf) free((void*)buf);
      return ret;
#else
      // Measure first, then store as much as fits.  Storing at most
      // dstlen-1 bytes guarantees room for the terminator, which
      // wcstombs() omits whenever it stops before the end of the input.
      size_t need = wcstombs(0, buf, 0);
      if (need != (size_t)-1 && dstlen) {
	size_t done = wcstombs(dst, buf, dstlen-1);
	if (done == (size_t)-1) need = done;
	else dst[done] = 0;
      }
      if (buf != lbuf) free((void*)buf);
      if (need != (size_t)-1) return (unsigned)need;
      // on any errors we return the UTF-8 as raw text...
#endif
    }
    // allocation failure: also return the UTF-8 as raw text
  }
  // identity transform (nothing is stored when dstlen is zero):
  if (dstlen) {
    unsigned ncopy = (srclen < dstlen) ? srclen : dstlen-1;
    memcpy(dst, src, ncopy);
    dst[ncopy] = 0;
  }
  return srclen;
}
1109 :
1110 : /*! Convert a filename from the locale-specific multibyte encoding
1111 : used by Windows to UTF-8 as used by FLTK.
1112 :
1113 : Up to \a dstlen bytes are written to \a dst, including a null
1114 : terminator. The return value is the number of bytes that would be
1115 : written, not counting the null terminator. If greater or equal to
1116 : \a dstlen then if you malloc a new array of size n+1 you will have
1117 : the space needed for the entire string. If \a dstlen is zero then
1118 : nothing is written and this call just measures the storage space
1119 : needed.
1120 :
1121 : On Unix or on Windows when a UTF-8 locale is in effect, this
1122 : does not change the data. It is copied and truncated as necessary to
1123 : the destination buffer and \a srclen is always returned.
1124 : You may also want to check if utf8test() returns non-zero, so that
1125 : the filesystem can store filenames in UTF-8 encoding regardless of
1126 : the locale.
1127 : */
unsigned utf8frommb(char* dst, unsigned dstlen,
		    const char* src, unsigned srclen)
{
  // Converts text in the locale's multibyte encoding to UTF-8.  At most
  // dstlen bytes (including the NUL) are stored in dst; the return value
  // is the length of the complete result.  If the locale is UTF-8
  // already, or (on Unix) the conversion fails, the bytes are copied
  // through unchanged and srclen is returned.
  if (!utf8locale()) {
    wchar_t lbuf[1024];
    wchar_t* buf = lbuf;
    unsigned ret;
#ifdef _WIN32
    unsigned length =
      MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
    if (length == 0 && srclen != 0) {
      // A zero return means failure, which includes "lbuf is too small":
      // measure the input and retry with a heap buffer.
      unsigned needed = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
      if (needed > 0) {
	wchar_t* heap = (wchar_t*)(malloc(needed*sizeof(wchar_t)));
	if (heap) {
	  buf = heap;
	  length = MultiByteToWideChar(GetACP(), 0, src, srclen, buf, needed);
	}
      }
    }
    // If the conversion failed outright, length is 0 and an empty string
    // is produced.
    ret = utf8fromwc(dst, dstlen, buf, length);
    if (buf != lbuf) free((void*)buf);
    return ret;
#else
    // NOTE: mbstowcs() reads src up to its terminating NUL; srclen is
    // not consulted on this path.
    size_t length = mbstowcs(buf, src, 1024);
    if (length != (size_t)-1 && length >= 1024) {
      // lbuf was filled completely, so the input may be longer: measure
      // it and convert again into a heap buffer with room for the NUL.
      length = mbstowcs(0, src, 0);
      if (length != (size_t)-1) {
	buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
	if (buf) mbstowcs(buf, src, length+1);
	else length = (size_t)-1; // treat allocation failure as an error
      }
    }
    if (length != (size_t)-1) {
      ret = utf8fromwc(dst, dstlen, buf, (unsigned)length);
      if (buf != lbuf) free((void*)buf);
      return ret;
    }
    // errors in conversion return the UTF-8 unchanged
#endif
  }
  // identity transform (nothing is stored when dstlen is zero):
  if (dstlen) {
    unsigned ncopy = (srclen < dstlen) ? srclen : dstlen-1;
    memcpy(dst, src, ncopy);
    dst[ncopy] = 0;
  }
  return srclen;
}
1176 :
1177 : #endif /* def notdef - disabled locale specific stuff */
1178 :
1179 : /*! Examines the first \a srclen bytes in \a src and return a verdict
1180 : on whether it is UTF-8 or not.
1181 : - Returns 0 if there are any illegal UTF-8 sequences, using the
1182 : same rules as utf8decode(). Note that some UCS values considered
1183 : illegal by RFC 3629, such as 0xffff, are considered legal by this.
1184 : - Returns 1 if there are only single-byte characters (ie no bytes
1185 : have the high bit set). This is legal UTF-8, but also indicates
1186 : plain ASCII. It also returns 1 if \a srclen is zero.
1187 : - Returns 2 if there are only characters less than 0x800.
1188 : - Returns 3 if there are only characters less than 0x10000.
1189 : - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
1190 :
1191 : Because there are many illegal sequences in UTF-8, it is almost
1192 : impossible for a string in another encoding to be confused with
1193 : UTF-8. This is very useful for transitioning Unix to UTF-8
1194 : filenames, you can simply test each filename with this to decide
1195 : if it is UTF-8 or in the locale encoding. My hope is that if
1196 : this is done we will be able to cleanly transition to a locale-less
1197 : encoding.
1198 : */
1199 :
static int utf8test(const char* src, unsigned srclen) {
  // Scans srclen bytes at src and returns 0 as soon as utf8decode()
  // reports a high-bit byte as a sequence shorter than two bytes;
  // otherwise returns the length of the longest sequence seen
  // (1 when every byte has its high bit clear).
  const char* const end = src + srclen;
  const char* cur = src;
  int widest = 1;

  while (cur < end) {
    int seqlen;

    if (!(*cur & 0x80)) {
      // High bit clear: single-byte character.
      cur++;
      continue;
    }

    utf8decode(cur, end, &seqlen);
    if (seqlen < 2) return 0;       // high-bit byte that is not a valid sequence
    if (widest < seqlen) widest = seqlen;
    cur += seqlen;
  }
  return widest;
}
1216 :
1217 : #endif /* defined(CPL_RECODE_STUB) */
|