1 : /**********************************************************************
2 : * $Id: cpl_recode_stub.cpp 24557 2012-06-10 10:22:49Z rouault $
3 : *
4 : * Name: cpl_recode_stub.cpp
5 : * Project: CPL - Common Portability Library
6 : * Purpose: Character set recoding and char/wchar_t conversions, stub
7 : * implementation to be used if iconv() functionality is not
8 : * available.
9 : * Author: Frank Warmerdam, warmerdam@pobox.com
10 : *
11 : * The bulk of this code is derived from the utf.c module from FLTK. It
12 : * was originally downloaded from:
13 : * http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
14 : *
15 : **********************************************************************
16 : * Copyright (c) 2008, Frank Warmerdam
17 : * Copyright 2006 by Bill Spitzak and others.
18 : *
19 : * Permission to use, copy, modify, and distribute this software for any
20 : * purpose with or without fee is hereby granted, provided that the above
21 : * copyright notice and this permission notice appear in all copies.
22 : *
23 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
24 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
25 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
26 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
27 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
28 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
29 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
30 : **********************************************************************/
31 :
32 : #include "cpl_string.h"
33 :
34 : CPL_CVSID("$Id: cpl_recode_stub.cpp 24557 2012-06-10 10:22:49Z rouault $");
35 :
36 : #ifdef CPL_RECODE_STUB
37 :
38 : static unsigned utf8decode(const char* p, const char* end, int* len);
39 : static unsigned utf8towc(const char* src, unsigned srclen,
40 : wchar_t* dst, unsigned dstlen);
41 : static unsigned utf8toa(const char* src, unsigned srclen,
42 : char* dst, unsigned dstlen);
43 : static unsigned utf8fromwc(char* dst, unsigned dstlen,
44 : const wchar_t* src, unsigned srclen);
45 : static unsigned utf8froma(char* dst, unsigned dstlen,
46 : const char* src, unsigned srclen);
47 : static int utf8test(const char* src, unsigned srclen);
48 :
49 : #ifdef _WIN32
50 :
51 : #include <windows.h>
52 : #include <winnls.h>
53 :
54 : static char* CPLWin32Recode( const char* src,
55 : unsigned src_code_page, unsigned dst_code_page );
56 : #endif
57 :
58 : #ifdef FUTURE_NEEDS
59 : static const char* utf8fwd(const char* p, const char* start, const char* end);
60 : static const char* utf8back(const char* p, const char* start, const char*end);
61 : static int utf8encode(unsigned ucs, char* buf);
62 : static int utf8bytes(unsigned ucs);
63 : #endif /* def FUTURE_NEEDS */
64 :
65 : /************************************************************************/
66 : /* ==================================================================== */
67 : /* Stub Implementation not depending on iconv() or WIN32 API. */
68 : /* ==================================================================== */
69 : /************************************************************************/
70 :
71 : static int bHaveWarned1 = FALSE;
72 : static int bHaveWarned2 = FALSE;
73 : static int bHaveWarned3 = FALSE;
74 : static int bHaveWarned4 = FALSE;
75 : static int bHaveWarned5 = FALSE;
76 : static int bHaveWarned6 = FALSE;
77 :
78 : /************************************************************************/
79 : /* CPLClearRecodeStubWarningFlags() */
80 : /************************************************************************/
81 :
82 6786 : void CPLClearRecodeStubWarningFlags()
83 : {
84 6786 : bHaveWarned1 = FALSE;
85 6786 : bHaveWarned2 = FALSE;
86 6786 : bHaveWarned3 = FALSE;
87 6786 : bHaveWarned4 = FALSE;
88 6786 : bHaveWarned5 = FALSE;
89 6786 : bHaveWarned6 = FALSE;
90 6786 : }
91 :
92 : /************************************************************************/
93 : /* CPLRecodeStub() */
94 : /************************************************************************/
95 :
96 : /**
97 : * Convert a string from a source encoding to a destination encoding.
98 : *
99 : * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
100 : * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
101 : * <ul>
102 : * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in fact)</li>
103 : * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
104 : * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
105 : * </ul>
106 : *
107 : * If an error occurs an error may, or may not be posted with CPLError().
108 : *
109 : * @param pszSource a NULL terminated string.
110 : * @param pszSrcEncoding the source encoding.
111 : * @param pszDstEncoding the destination encoding.
112 : *
113 : * @return a NULL terminated string which should be freed with CPLFree().
114 : */
115 :
116 55342 : char *CPLRecodeStub( const char *pszSource,
117 : const char *pszSrcEncoding,
118 : const char *pszDstEncoding )
119 :
120 : {
121 : /* -------------------------------------------------------------------- */
122 : /* If the source or destination is current locale(), we change */
123 : /* it to ISO8859-1 since our stub implementation does not */
124 : /* attempt to address locales properly. */
125 : /* -------------------------------------------------------------------- */
126 :
127 55342 : if( pszSrcEncoding[0] == '\0' )
128 0 : pszSrcEncoding = CPL_ENC_ISO8859_1;
129 :
130 55342 : if( pszDstEncoding[0] == '\0' )
131 0 : pszDstEncoding = CPL_ENC_ISO8859_1;
132 :
133 : /* -------------------------------------------------------------------- */
134 : /* ISO8859 to UTF8 */
135 : /* -------------------------------------------------------------------- */
136 55342 : if( strcmp(pszSrcEncoding,CPL_ENC_ISO8859_1) == 0
137 : && strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
138 : {
139 47538 : int nCharCount = strlen(pszSource);
140 47538 : char *pszResult = (char *) CPLCalloc(1,nCharCount*2+1);
141 :
142 47538 : utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
143 :
144 47538 : return pszResult;
145 : }
146 :
147 : /* -------------------------------------------------------------------- */
148 : /* UTF8 to ISO8859 */
149 : /* -------------------------------------------------------------------- */
150 7804 : if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0
151 : && strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0 )
152 : {
153 7804 : int nCharCount = strlen(pszSource);
154 7804 : char *pszResult = (char *) CPLCalloc(1,nCharCount+1);
155 :
156 7804 : utf8toa( pszSource, nCharCount, pszResult, nCharCount+1 );
157 :
158 7804 : return pszResult;
159 : }
160 :
161 : #ifdef _WIN32
162 : /* ---------------------------------------------------------------------*/
163 : /* CPXXX to UTF8 */
164 : /* ---------------------------------------------------------------------*/
165 : if( strncmp(pszSrcEncoding,"CP",2) == 0
166 : && strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
167 : {
168 : int nCode = atoi( pszSrcEncoding + 2 );
169 : if( nCode > 0 ) {
170 : return CPLWin32Recode( pszSource, nCode, CP_UTF8 );
171 : }
172 : }
173 :
174 : /* ---------------------------------------------------------------------*/
175 : /* UTF8 to CPXXX */
176 : /* ---------------------------------------------------------------------*/
177 : if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0
178 : && strncmp(pszDstEncoding,"CP",2) == 0 )
179 : {
180 : int nCode = atoi( pszDstEncoding + 2 );
181 : if( nCode > 0 ) {
182 : return CPLWin32Recode( pszSource, CP_UTF8, nCode );
183 : }
184 : }
185 : #endif
186 :
187 : /* -------------------------------------------------------------------- */
188 : /* Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with */
189 : /* a one-time warning. */
190 : /* -------------------------------------------------------------------- */
191 0 : if( strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
192 : {
193 0 : int nCharCount = strlen(pszSource);
194 0 : char *pszResult = (char *) CPLCalloc(1,nCharCount*2+1);
195 :
196 0 : if( !bHaveWarned1 )
197 : {
198 0 : bHaveWarned1 = 1;
199 : CPLError( CE_Warning, CPLE_AppDefined,
200 : "Recode from %s to UTF-8 not supported, treated as ISO8859-1 to UTF-8.",
201 0 : pszSrcEncoding );
202 : }
203 :
204 0 : utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
205 :
206 0 : return pszResult;
207 : }
208 :
209 : /* -------------------------------------------------------------------- */
210 : /* UTF-8 to anything else is treated as UTF-8 to ISO-8859-1 */
211 : /* with a warning. */
212 : /* -------------------------------------------------------------------- */
213 0 : if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0
214 : && strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0 )
215 : {
216 0 : int nCharCount = strlen(pszSource);
217 0 : char *pszResult = (char *) CPLCalloc(1,nCharCount+1);
218 :
219 0 : if( !bHaveWarned2 )
220 : {
221 0 : bHaveWarned2 = 1;
222 : CPLError( CE_Warning, CPLE_AppDefined,
223 : "Recode from UTF-8 to %s not supported, treated as UTF-8 to ISO8859-1.",
224 0 : pszDstEncoding );
225 : }
226 :
227 0 : utf8toa( pszSource, nCharCount, pszResult, nCharCount+1 );
228 :
229 0 : return pszResult;
230 : }
231 :
232 : /* -------------------------------------------------------------------- */
233 : /* Everything else is treated as a no-op with a warning. */
234 : /* -------------------------------------------------------------------- */
235 : {
236 0 : if( !bHaveWarned3 )
237 : {
238 0 : bHaveWarned3 = 1;
239 : CPLError( CE_Warning, CPLE_AppDefined,
240 : "Recode from %s to %s not supported, no change applied.",
241 0 : pszSrcEncoding, pszDstEncoding );
242 : }
243 :
244 0 : return CPLStrdup(pszSource);
245 : }
246 : }
247 :
248 : /************************************************************************/
249 : /* CPLRecodeFromWCharStub() */
250 : /************************************************************************/
251 :
252 : /**
253 : * Convert wchar_t string to UTF-8.
254 : *
255 : * Convert a wchar_t string into a multibyte utf-8 string. The only
256 : * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
257 : * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
258 : * and CPL_ENC_ISO8859_1. In some cases (ie. using iconv()) other encodings
259 : * may also be supported.
260 : *
261 : * Note that the wchar_t type varies in size on different systems. On
262 : * win32 it is normally 2 bytes, and on unix 4 bytes.
263 : *
264 : * If an error occurs an error may, or may not be posted with CPLError().
265 : *
266 : * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
267 : * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
268 : * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
269 : *
270 : * @return a zero terminated multi-byte string which should be freed with
271 : * CPLFree(), or NULL if an error occurs.
272 : */
273 :
274 12605 : char *CPLRecodeFromWCharStub( const wchar_t *pwszSource,
275 : const char *pszSrcEncoding,
276 : const char *pszDstEncoding )
277 :
278 : {
279 : /* -------------------------------------------------------------------- */
280 : /* We try to avoid changes of character set. We are just */
281 : /* providing for unicode to unicode. */
282 : /* -------------------------------------------------------------------- */
283 12605 : if( strcmp(pszSrcEncoding,"WCHAR_T") != 0 &&
284 : strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0
285 : && strcmp(pszSrcEncoding,CPL_ENC_UTF16) != 0
286 : && strcmp(pszSrcEncoding,CPL_ENC_UCS2) != 0
287 : && strcmp(pszSrcEncoding,CPL_ENC_UCS4) != 0 )
288 : {
289 : CPLError( CE_Failure, CPLE_AppDefined,
290 : "Stub recoding implementation does not support\n"
291 : "CPLRecodeFromWCharStub(...,%s,%s)",
292 0 : pszSrcEncoding, pszDstEncoding );
293 0 : return NULL;
294 : }
295 :
296 : /* -------------------------------------------------------------------- */
297 : /* What is the source length. */
298 : /* -------------------------------------------------------------------- */
299 12605 : int nSrcLen = 0;
300 :
301 4355134 : while( pwszSource[nSrcLen] != 0 )
302 4329924 : nSrcLen++;
303 :
304 : /* -------------------------------------------------------------------- */
305 : /* Allocate destination buffer plenty big. */
306 : /* -------------------------------------------------------------------- */
307 : char *pszResult;
308 : int nDstBufSize, nDstLen;
309 :
310 12605 : nDstBufSize = nSrcLen * 4 + 1;
311 12605 : pszResult = (char *) CPLMalloc(nDstBufSize); // nearly worst case.
312 :
313 12605 : if (nSrcLen == 0)
314 : {
315 0 : pszResult[0] = '\0';
316 0 : return pszResult;
317 : }
318 :
319 : /* -------------------------------------------------------------------- */
320 : /* Convert, and confirm we had enough space. */
321 : /* -------------------------------------------------------------------- */
322 12605 : nDstLen = utf8fromwc( pszResult, nDstBufSize, pwszSource, nSrcLen );
323 12605 : if( nDstLen >= nDstBufSize - 1 )
324 : {
325 0 : CPLAssert( FALSE ); // too small!
326 0 : return NULL;
327 : }
328 :
329 : /* -------------------------------------------------------------------- */
330 : /* If something other than UTF-8 was requested, recode now. */
331 : /* -------------------------------------------------------------------- */
332 12605 : if( strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
333 12605 : return pszResult;
334 :
335 : char *pszFinalResult =
336 0 : CPLRecodeStub( pszResult, CPL_ENC_UTF8, pszDstEncoding );
337 :
338 0 : CPLFree( pszResult );
339 :
340 0 : return pszFinalResult;
341 : }
342 :
343 : /************************************************************************/
344 : /* CPLRecodeToWCharStub() */
345 : /************************************************************************/
346 :
347 : /**
348 : * Convert UTF-8 string to a wchar_t string.
349 : *
350 : * Convert a 8bit, multi-byte per character input string into a wide
351 : * character (wchar_t) string. The only guaranteed supported source encodings
352 : * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1). The only
353 : * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source
354 : * and destination encodings may be supported depending on the underlying
355 : * implementation.
356 : *
357 : * Note that the wchar_t type varies in size on different systems. On
358 : * win32 it is normally 2 bytes, and on unix 4 bytes.
359 : *
360 : * If an error occurs an error may, or may not be posted with CPLError().
361 : *
362 : * @param pszSource input multi-byte character string.
363 : * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
364 : * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
365 : *
366 : * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
367 : * NULL on error.
368 : *
369 : * @since GDAL 1.6.0
370 : */
371 :
372 2933 : wchar_t *CPLRecodeToWCharStub( const char *pszSource,
373 : const char *pszSrcEncoding,
374 : const char *pszDstEncoding )
375 :
376 : {
377 2933 : char *pszUTF8Source = (char *) pszSource;
378 :
379 2933 : if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0
380 : && strcmp(pszSrcEncoding,CPL_ENC_ASCII) != 0 )
381 : {
382 0 : pszUTF8Source = CPLRecodeStub( pszSource, pszSrcEncoding, CPL_ENC_UTF8 );
383 0 : if( pszUTF8Source == NULL )
384 0 : return NULL;
385 : }
386 :
387 : /* -------------------------------------------------------------------- */
388 : /* We try to avoid changes of character set. We are just */
389 : /* providing for unicode to unicode. */
390 : /* -------------------------------------------------------------------- */
391 2933 : if( strcmp(pszDstEncoding,"WCHAR_T") != 0
392 : && strcmp(pszDstEncoding,CPL_ENC_UCS2) != 0
393 : && strcmp(pszDstEncoding,CPL_ENC_UCS4) != 0
394 : && strcmp(pszDstEncoding,CPL_ENC_UTF16) != 0 )
395 : {
396 : CPLError( CE_Failure, CPLE_AppDefined,
397 : "Stub recoding implementation does not support\n"
398 : "CPLRecodeToWCharStub(...,%s,%s)",
399 0 : pszSrcEncoding, pszDstEncoding );
400 0 : return NULL;
401 : }
402 :
403 : /* -------------------------------------------------------------------- */
404 : /* Do the UTF-8 to UCS-2 recoding. */
405 : /* -------------------------------------------------------------------- */
406 2933 : int nSrcLen = strlen(pszUTF8Source);
407 2933 : wchar_t *pwszResult = (wchar_t *) CPLCalloc(sizeof(wchar_t),nSrcLen+1);
408 :
409 2933 : utf8towc( pszUTF8Source, nSrcLen, pwszResult, nSrcLen+1 );
410 :
411 2933 : if( pszUTF8Source != pszSource )
412 0 : CPLFree( pszUTF8Source );
413 :
414 2933 : return pwszResult;
415 : }
416 :
417 :
418 : /************************************************************************/
419 : /* CPLIsUTF8() */
420 : /************************************************************************/
421 :
422 : /**
423 : * Test if a string is encoded as UTF-8.
424 : *
425 : * @param pabyData input string to test
426 : * @param nLen length of the input string, or -1 if the function must compute
427 : * the string length. In which case it must be null terminated.
428 : * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
429 : *
430 : * @since GDAL 1.7.0
431 : */
432 476054 : int CPLIsUTF8Stub(const char* pabyData, int nLen)
433 : {
434 476054 : if (nLen < 0)
435 476054 : nLen = strlen(pabyData);
436 476054 : return utf8test(pabyData, (unsigned)nLen) != 0;
437 : }
438 :
439 : /************************************************************************/
440 : /* ==================================================================== */
441 : /* UTF.C code from FLTK with some modifications. */
442 : /* ==================================================================== */
443 : /************************************************************************/
444 :
/* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
   they are instead turned into the Unicode REPLACEMENT CHARACTER, of
   value 0xfffd.
   If this is on utf8decode will correctly map most (perhaps all)
   human-readable text that is in ISO-8859-1. This may allow you
   to completely ignore character sets in your code because virtually
   everything is either ISO-8859-1 or UTF-8.
*/
#define ERRORS_TO_ISO8859_1 1

/* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
   Unicode index for Microsoft's CP1252 character set. You should
   also set ERRORS_TO_ISO8859_1. With this a huge amount of more
   available text (such as all web pages) is correctly converted
   to Unicode.
*/
#define ERRORS_TO_CP1252 1

/* A number of Unicode code points are in fact illegal and should not
   be produced by a UTF-8 converter. Turning this on will replace the
   bytes in those encodings with errors. If you do this then converting
   arbitrary 16-bit data to UTF-8 and then back is not an identity,
   which will probably break a lot of software.
*/
#define STRICT_RFC3629 0

#if ERRORS_TO_CP1252
// Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
// to Unicode.  Read-only lookup table, indexed by (byte - 0x80).
static const unsigned short cp1252[32] = {
  0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
  0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
  0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
  0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
};
#endif

/************************************************************************/
/*                             utf8decode()                             */
/************************************************************************/

/* Result for a lead byte that does not start a valid sequence: exactly
   one byte is consumed and, depending on ERRORS_TO_ISO8859_1, the value
   is either the byte itself or the Unicode REPLACEMENT CHARACTER. */
static unsigned utf8decode_error(unsigned char c, int* len)
{
  *len = 1;
#if ERRORS_TO_ISO8859_1
  return c;
#else
  (void)c;
  return 0xfffd; // Unicode REPLACEMENT CHARACTER
#endif
}

/*
    Decode a single UTF-8 encoded character starting at \e p. The
    resulting Unicode value (in the range 0-0x10ffff) is returned,
    and \e len is set to the number of bytes in the UTF-8 encoding
    (adding \e len to \e p will point at the next character).

    \e end points one past the last readable byte. No byte at or
    beyond \e end is ever read; \e p itself must be below \e end.

    If \a p points at an illegal UTF-8 encoding, including one that
    would go past \e end, or one that uses more bytes than necessary,
    then \e len is set to 1 and the single byte is translated instead:
    with ERRORS_TO_ISO8859_1 it is returned unchanged, otherwise
    0xfffd is returned. Independently of validity, with
    ERRORS_TO_CP1252 every byte in 0x80..0x9f is mapped through the
    CP1252 table. Treating errors this way allows this to decode
    almost any ISO-8859-1 or CP1252 text that has been mistakenly
    placed where UTF-8 is expected.

    If you want errors to be converted to error characters (as the
    standards recommend), adding a test to see if the length is
    unexpectedly 1 will work:

\code
    if (*p & 0x80) {               // what should be a multibyte encoding
      code = utf8decode(p,end,&len);
      if (len<2) code = 0xFFFD;    // Turn errors into REPLACEMENT CHARACTER
    } else {                       // handle the 1-byte utf8 encoding:
      code = *p;
      len = 1;
    }
\endcode

    Direct testing for the 1-byte case (as shown above) will also
    speed up the scanning of strings where the majority of characters
    are ASCII.
*/
static unsigned utf8decode(const char* p, const char* end, int* len)
{
  const unsigned char* s = (const unsigned char*)p;
  const unsigned char c = s[0];

  // Single byte: plain ASCII.
  if (c < 0x80) {
    *len = 1;
    return c;
  }
#if ERRORS_TO_CP1252
  // Stray byte in the C1 range: interpret it as CP1252.
  if (c < 0xa0) {
    *len = 1;
    return cp1252[c-0x80];
  }
#endif
  // Continuation bytes and the overlong lead bytes 0xc0/0xc1.
  if (c < 0xc2) return utf8decode_error(c, len);

  // Every remaining form needs at least one continuation byte.
  if (p+1 >= end || (s[1]&0xc0) != 0x80) return utf8decode_error(c, len);

  // Two byte sequence.
  if (c < 0xe0) {
    *len = 2;
    return
      ((unsigned)(c & 0x1f) << 6) +
      ((unsigned)(s[1] & 0x3f));
  }

  // Three byte sequence.
  if (c < 0xf0) {
    // 0xe0 followed by less than 0xa0 is an overlong encoding.
    if (c == 0xe0 && s[1] < 0xa0) return utf8decode_error(c, len);
#if STRICT_RFC3629
    // RFC 3629 says surrogate chars are illegal.
    if (c == 0xed && s[1] >= 0xa0) return utf8decode_error(c, len);
#endif
    if (p+2 >= end || (s[2]&0xc0) != 0x80) return utf8decode_error(c, len);
#if STRICT_RFC3629
    // 0xfffe and 0xffff are also illegal characters.  Checked only after
    // the bounds test above so that s[2] is known to be readable.
    if (c == 0xef && s[1] == 0xbf && s[2] >= 0xbe)
      return utf8decode_error(c, len);
#endif
    *len = 3;
    return
      ((unsigned)(c & 0x0f) << 12) +
      ((unsigned)(s[1] & 0x3f) << 6) +
      ((unsigned)(s[2] & 0x3f));
  }

  // Four byte sequence.
  if (c <= 0xf4) {
    // 0xf0 followed by less than 0x90 is an overlong encoding.
    if (c == 0xf0 && s[1] < 0x90) return utf8decode_error(c, len);
    // 0xf4 followed by more than 0x8f lies after 0x10ffff.
    if (c == 0xf4 && s[1] > 0x8f) return utf8decode_error(c, len);
    if (p+3 >= end || (s[2]&0xc0) != 0x80 || (s[3]&0xc0) != 0x80)
      return utf8decode_error(c, len);
#if STRICT_RFC3629
    // RFC 3629 says all codes ending in fffe or ffff are illegal:
    if ((s[1]&0xf) == 0xf && s[2] == 0xbf && s[3] >= 0xbe)
      return utf8decode_error(c, len);
#endif
    *len = 4;
    return
      ((unsigned)(c & 0x07) << 18) +
      ((unsigned)(s[1] & 0x3f) << 12) +
      ((unsigned)(s[2] & 0x3f) << 6) +
      ((unsigned)(s[3] & 0x3f));
  }

  // Lead bytes 0xf5..0xff never start a valid sequence.
  return utf8decode_error(c, len);
}
591 :
/************************************************************************/
/*                              utf8fwd()                               */
/************************************************************************/

/*
    Move \a p forward until it points to the start of a UTF-8
    character. If it already points at the start of one then it
    is returned unchanged. Any UTF-8 errors are treated as though each
    byte of the error is an individual character.

    \e start is the start of the string and is used to limit the
    backwards search for the start of a UTF-8 character.

    \e end is the end of the string and is assumed to be a break
    between characters. It is assumed to be greater than p.

    This function is for moving a pointer that was jumped to the
    middle of a string, such as when doing a binary search for
    a position. You should use either this or utf8back() depending
    on which direction your algorithm can handle the pointer
    moving. Do not use this to scan strings, use utf8decode()
    instead.
*/

#ifdef FUTURE_NEEDS
static const char* utf8fwd(const char* p, const char* start, const char* end)
{
  const char* lead;
  int nBytes;

  // Not on a continuation byte: already at a character boundary.
  if ((*p & 0xc0) != 0x80)
    return p;

  // Walk backwards looking for the lead byte (both top bits set).  Falling
  // off the front of the string, or meeting an ASCII byte, means p is a
  // stray continuation byte and counts as a character of its own.
  lead = p - 1;
  while (1) {
    if (lead < start || (*lead & 0x80) == 0)
      return p;
    if ((*lead & 0x40) != 0)
      break;
    --lead;
  }

  // If the character that starts at the lead byte extends beyond p, the
  // next boundary is right after it; otherwise p does not belong to it.
  utf8decode(lead, end, &nBytes);
  return (lead + nBytes > p) ? lead + nBytes : p;
}
#endif /* def FUTURE_NEEDS */
635 :
/************************************************************************/
/*                              utf8back()                              */
/************************************************************************/

/*
    Move \a p backward until it points to the start of a UTF-8
    character. If it already points at the start of one then it
    is returned unchanged. Any UTF-8 errors are treated as though each
    byte of the error is an individual character.

    \e start is the start of the string and is used to limit the
    backwards search for the start of a UTF-8 character.

    \e end is the end of the string and is assumed to be a break
    between characters. It is assumed to be greater than p.

    If you wish to decrement a UTF-8 pointer, pass p-1 to this.
*/

#ifdef FUTURE_NEEDS
static const char* utf8back(const char* p, const char* start, const char* end)
{
  const char* lead;
  int nBytes;

  // Not on a continuation byte: already at a character boundary.
  if ((*p & 0xc0) != 0x80)
    return p;

  // Walk backwards looking for the lead byte (both top bits set).  Falling
  // off the front of the string, or meeting an ASCII byte, means p is a
  // stray continuation byte and counts as a character of its own.
  lead = p - 1;
  while (1) {
    if (lead < start || (*lead & 0x80) == 0)
      return p;
    if ((*lead & 0x40) != 0)
      break;
    --lead;
  }

  // Only step back to the lead byte when its character really covers p.
  utf8decode(lead, end, &nBytes);
  return (lead + nBytes > p) ? lead : p;
}
#endif /* def FUTURE_NEEDS */
673 :
/************************************************************************/
/*                      utf8bytes() / utf8encode()                      */
/************************************************************************/

#ifdef FUTURE_NEEDS

/* Returns number of bytes that utf8encode() will use to encode the
   character \a ucs.

   The two functions must agree for every input, since callers may size
   a buffer with utf8bytes() before filling it with utf8encode(). In
   particular 0x10ffff, the last legal code point, takes 4 bytes;
   anything above it takes the 3 bytes of the replacement character. */
static int utf8bytes(unsigned ucs) {
  if (ucs < 0x000080U) return 1;
  if (ucs < 0x000800U) return 2;
  if (ucs < 0x010000U) return 3;
  if (ucs <= 0x10ffffU) return 4;
  return 3; // length of the illegal character encoding
}

/* Write the UTF-8 encoding of \e ucs into \e buf and return the
   number of bytes written. Up to 4 bytes may be written. If you know
   that \a ucs is less than 0x10000 then at most 3 bytes will be written.
   If you wish to speed this up, remember that anything less than 0x80
   is written as a single byte. No terminating zero is written.

   If ucs is greater than 0x10ffff this is an illegal character
   according to RFC 3629. These are converted as though they are
   0xFFFD (REPLACEMENT CHARACTER).

   RFC 3629 also says many other values for \a ucs are illegal (in
   the range 0xd800 to 0xdfff, or ending with 0xfffe or
   0xffff). However these are encoded as though they are legal, so that
   utf8encode/utf8decode will be the identity for all codes between 0
   and 0x10ffff.
*/
static int utf8encode(unsigned ucs, char* buf) {
  if (ucs < 0x000080U) {
    buf[0] = (char)ucs;
    return 1;
  }
  if (ucs < 0x000800U) {
    buf[0] = (char)(0xc0 | (ucs >> 6));
    buf[1] = (char)(0x80 | (ucs & 0x3F));
    return 2;
  }
  if (ucs < 0x010000U) {
    buf[0] = (char)(0xe0 | (ucs >> 12));
    buf[1] = (char)(0x80 | ((ucs >> 6) & 0x3F));
    buf[2] = (char)(0x80 | (ucs & 0x3F));
    return 3;
  }
  // Inclusive bound: 0x10ffff itself is a legal code point.
  if (ucs <= 0x10ffffU) {
    buf[0] = (char)(0xf0 | (ucs >> 18));
    buf[1] = (char)(0x80 | ((ucs >> 12) & 0x3F));
    buf[2] = (char)(0x80 | ((ucs >> 6) & 0x3F));
    buf[3] = (char)(0x80 | (ucs & 0x3F));
    return 4;
  }
  // encode 0xfffd:
  buf[0] = (char)0xefU;
  buf[1] = (char)0xbfU;
  buf[2] = (char)0xbdU;
  return 3;
}
#endif /* def FUTURE_NEEDS */
745 :
746 : /************************************************************************/
747 : /* utf8towc() */
748 : /************************************************************************/
749 :
750 : /* Convert a UTF-8 sequence into an array of wchar_t. These
751 : are used by some system calls, especially on Windows.
752 :
753 : \a src points at the UTF-8, and \a srclen is the number of bytes to
754 : convert.
755 :
756 : \a dst points at an array to write, and \a dstlen is the number of
757 : locations in this array. At most \a dstlen-1 words will be
758 : written there, plus a 0 terminating word. Thus this function
759 : will never overwrite the buffer and will always return a
760 : zero-terminated string. If \a dstlen is zero then \a dst can be
761 : null and no data is written, but the length is returned.
762 :
763 : The return value is the number of words that \e would be written
764 : to \a dst if it were long enough, not counting the terminating
765 : zero. If the return value is greater or equal to \a dstlen it
766 : indicates truncation, you can then allocate a new array of size
767 : return+1 and call this again.
768 :
769 : Errors in the UTF-8 are converted as though each byte in the
770 : erroneous string is in the Microsoft CP1252 encoding. This allows
771 : ISO-8859-1 text mistakenly identified as UTF-8 to be printed
772 : correctly.
773 :
774 : Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
775 : and most other systems. Where wchar_t is 16 bits, Unicode
776 : characters in the range 0x10000 to 0x10ffff are converted to
777 : "surrogate pairs" which take two words each (this is called UTF-16
778 : encoding). If wchar_t is 32 bits this rather nasty problem is
779 : avoided.
780 : */
781 2933 : static unsigned utf8towc(const char* src, unsigned srclen,
782 : wchar_t* dst, unsigned dstlen)
783 : {
784 2933 : const char* p = src;
785 2933 : const char* e = src+srclen;
786 2933 : unsigned count = 0;
787 15241 : if (dstlen) for (;;) {
788 15241 : if (p >= e) {dst[count] = 0; return count;}
789 12308 : if (!(*p & 0x80)) { // ascii
790 12160 : dst[count] = *p++;
791 : } else {
792 148 : int len; unsigned ucs = utf8decode(p,e,&len);
793 148 : p += len;
794 : #ifdef _WIN32
795 : if (ucs < 0x10000) {
796 : dst[count] = (wchar_t)ucs;
797 : } else {
798 : // make a surrogate pair:
799 : if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
800 : dst[count] = (wchar_t)((((ucs-0x10000u)>>10)&0x3ff) | 0xd800);
801 : dst[++count] = (wchar_t)((ucs&0x3ff) | 0xdc00);
802 : }
803 : #else
804 148 : dst[count] = (wchar_t)ucs;
805 : #endif
806 : }
807 12308 : if (++count == dstlen) {dst[count-1] = 0; break;}
808 : }
809 : // we filled dst, measure the rest:
810 0 : while (p < e) {
811 0 : if (!(*p & 0x80)) p++;
812 : else {
813 : #ifdef _WIN32
814 : int len; unsigned ucs = utf8decode(p,e,&len);
815 : p += len;
816 : if (ucs >= 0x10000) ++count;
817 : #else
818 0 : int len; utf8decode(p,e,&len);
819 0 : p += len;
820 : #endif
821 : }
822 0 : ++count;
823 : }
824 0 : return count;
825 : }
826 :
827 : /************************************************************************/
828 : /* utf8toa() */
829 : /************************************************************************/
830 : /* Convert a UTF-8 sequence into an array of 1-byte characters.
831 :
832 : If the UTF-8 decodes to a character greater than 0xff then it is
833 : replaced with '?'.
834 :
835 : Errors in the UTF-8 are converted as individual bytes, same as
836 : utf8decode() does. This allows ISO-8859-1 text mistakenly identified
837 : as UTF-8 to be printed correctly (and possibly CP1252 on Windows).
838 :
839 : \a src points at the UTF-8, and \a srclen is the number of bytes to
840 : convert.
841 :
842 : Up to \a dstlen bytes are written to \a dst, including a null
843 : terminator. The return value is the number of bytes that would be
844 : written, not counting the null terminator. If greater or equal to
845 : \a dstlen then if you malloc a new array of size n+1 you will have
846 : the space needed for the entire string. If \a dstlen is zero then
847 : nothing is written and this call just measures the storage space
848 : needed.
849 : */
850 7804 : static unsigned utf8toa(const char* src, unsigned srclen,
851 : char* dst, unsigned dstlen)
852 : {
853 7804 : const char* p = src;
854 7804 : const char* e = src+srclen;
855 7804 : unsigned count = 0;
856 58894 : if (dstlen) for (;;) {
857 : unsigned char c;
858 58894 : if (p >= e) {dst[count] = 0; return count;}
859 51090 : c = *(unsigned char*)p;
860 51090 : if (c < 0xC2) { // ascii or bad code
861 51077 : dst[count] = c;
862 51077 : p++;
863 : } else {
864 13 : int len; unsigned ucs = utf8decode(p,e,&len);
865 13 : p += len;
866 13 : if (ucs < 0x100) dst[count] = (char)ucs;
867 : else
868 : {
869 4 : if (!bHaveWarned4)
870 : {
871 2 : bHaveWarned4 = TRUE;
872 : CPLError(CE_Warning, CPLE_AppDefined,
873 : "One or several characters couldn't be converted correctly from UTF-8 to ISO-8859-1.\n"
874 2 : "This warning will not be emitted anymore.");
875 : }
876 4 : dst[count] = '?';
877 : }
878 : }
879 51090 : if (++count >= dstlen) {dst[count-1] = 0; break;}
880 : }
881 : // we filled dst, measure the rest:
882 0 : while (p < e) {
883 0 : if (!(*p & 0x80)) p++;
884 : else {
885 : int len;
886 0 : utf8decode(p,e,&len);
887 0 : p += len;
888 : }
889 0 : ++count;
890 : }
891 0 : return count;
892 : }
893 :
894 : /************************************************************************/
895 : /* utf8fromwc() */
896 : /************************************************************************/
897 : /* Turn "wide characters" as returned by some system calls
898 : (especially on Windows) into UTF-8.
899 :
900 : Up to \a dstlen bytes are written to \a dst, including a null
901 : terminator. The return value is the number of bytes that would be
902 : written, not counting the null terminator. If greater or equal to
903 : \a dstlen then if you malloc a new array of size n+1 you will have
904 : the space needed for the entire string. If \a dstlen is zero then
905 : nothing is written and this call just measures the storage space
906 : needed.
907 :
908 : \a srclen is the number of words in \a src to convert. On Windows
909 : this is not necessarily the number of characters, due to there
910 : possibly being "surrogate pairs" in the UTF-16 encoding used.
911 : On Unix wchar_t is 32 bits and each location is a character.
912 :
913 : On Unix if a src word is greater than 0x10ffff then this is an
914 : illegal character according to RFC 3629. These are converted as
915 : though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
916 : range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
917 : illegal according to RFC 3629. However I encode these as though
918 : they are legal, so that utf8towc will return the original data.
919 :
920 : On Windows "surrogate pairs" are converted to a single character
921 : and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
922 : pairs are converted as though they are individual characters.
923 : */
static unsigned utf8fromwc(char* dst, unsigned dstlen,
                           const wchar_t* src, unsigned srclen) {
    /* Encode srclen wide characters from src as UTF-8 into dst.

       At most dstlen bytes (terminator included) are stored. The return
       value is the number of bytes the complete conversion needs, not
       counting the terminator; a value >= dstlen means the output was
       truncated. With dstlen == 0 nothing is written and the call only
       measures.

       Where wchar_t is 16 bits (_WIN32) a high surrogate followed by a low
       surrogate is merged into one code point and emitted as 4 bytes;
       unpaired surrogates are emitted as ordinary 3-byte characters.
       Elsewhere, values above 0x10ffff are replaced by U+FFFD. */
    unsigned i = 0;
    unsigned count = 0;

    if (dstlen) for (;;) {
        unsigned ucs;
        if (i >= srclen) {dst[count] = 0; return count;}
        ucs = src[i++];

        if (ucs < 0x80U) {                      /* 1 byte */
            dst[count++] = (char)ucs;
            if (count >= dstlen) {dst[count-1] = 0; break;}
            continue;
        }
        if (ucs < 0x800U) {                     /* 2 bytes */
            if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
            dst[count++] = (char)(0xc0 | (ucs >> 6));
            dst[count++] = (char)(0x80 | (ucs & 0x3F));
            continue;
        }

#ifdef _WIN32
        /* i already points at the word following ucs. */
        if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
            src[i] >= 0xdc00 && src[i] <= 0xdfff) {
            unsigned ucs2 = src[i++];
            ucs = 0x10000U + ((ucs&0x3ff)<<10) + (ucs2&0x3ff);
        }
#else
        /* Outside the Unicode range: substitute REPLACEMENT CHARACTER. */
        if (ucs > 0x10ffff) ucs = 0xfffd;
#endif

        if (ucs >= 0x10000) {                   /* 4 bytes */
            if (count+4 >= dstlen) {dst[count] = 0; count += 4; break;}
            dst[count++] = (char)(0xf0 | (ucs >> 18));
            dst[count++] = (char)(0x80 | ((ucs >> 12) & 0x3F));
            dst[count++] = (char)(0x80 | ((ucs >> 6) & 0x3F));
            dst[count++] = (char)(0x80 | (ucs & 0x3F));
        } else {                                /* 3 bytes */
            if (count+3 >= dstlen) {dst[count] = 0; count += 3; break;}
            dst[count++] = (char)(0xe0 | (ucs >> 12));
            dst[count++] = (char)(0x80 | ((ucs >> 6) & 0x3F));
            dst[count++] = (char)(0x80 | (ucs & 0x3F));
        }
    }

    /* dst is exhausted (or absent): measure the rest with the same rules
       as the converting loop above. */
    while (i < srclen) {
        unsigned ucs = src[i++];
        if (ucs < 0x80U) {
            count++;
        } else if (ucs < 0x800U) {
            count += 2;
#ifdef _WIN32
        } else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
                   src[i] >= 0xdc00 && src[i] <= 0xdfff) {
            /* Surrogate pair: i was already advanced past the high half,
               so the low half is src[i] (the original looked at src[i+1]
               and therefore mis-measured). */
            ++i;
            count += 4;
#else
        } else if (ucs >= 0x10000 && ucs <= 0x10ffff) {
            count += 4;
#endif
        } else {
            count += 3;
        }
    }
    return count;
}
991 :
992 :
993 : /************************************************************************/
994 : /* utf8froma() */
995 : /************************************************************************/
996 :
997 : /* Convert an ISO-8859-1 (ie normal c-string) byte stream to UTF-8.
998 :
999 : It is possible this should convert Microsoft's CP1252 to UTF-8
1000 : instead. This would translate the codes in the range 0x80-0x9f
1001 : to different characters. Currently it does not do this.
1002 :
1003 : Up to \a dstlen bytes are written to \a dst, including a null
1004 : terminator. The return value is the number of bytes that would be
1005 : written, not counting the null terminator. If greater or equal to
1006 : \a dstlen then if you malloc a new array of size n+1 you will have
1007 : the space needed for the entire string. If \a dstlen is zero then
1008 : nothing is written and this call just measures the storage space
1009 : needed.
1010 :
1011 : \a srclen is the number of bytes in \a src to convert.
1012 :
1013 : If the return value equals \a srclen then this indicates that
1014 : no conversion is necessary, as only ASCII characters are in the
1015 : string.
1016 : */
static unsigned utf8froma(char* dst, unsigned dstlen,
                          const char* src, unsigned srclen) {
    /* Encode srclen ISO-8859-1 bytes from src as UTF-8 into dst. Bytes
       below 0x80 are copied, all others expand to two bytes. At most
       dstlen bytes (terminator included) are stored; the return value is
       the length the full conversion needs, without the terminator. */
    const unsigned char* in = (const unsigned char*)src;
    const unsigned char* const in_end = in + srclen;
    unsigned written = 0;
    int measuring = (dstlen == 0);   /* nonzero once dst can take no more */

    while (!measuring) {
        unsigned char byte;

        if (in >= in_end) {
            /* Whole input converted and it fitted. */
            dst[written] = 0;
            return written;
        }

        byte = *in++;
        if (byte >= 0x80U) {
            /* Two output bytes plus the terminator must still fit. */
            if (written + 2 >= dstlen) {
                dst[written] = 0;
                written += 2;
                measuring = 1;
            } else {
                dst[written++] = (char)(0xc0 | (byte >> 6));
                dst[written++] = (char)(0x80 | (byte & 0x3F));
            }
        } else {
            dst[written++] = (char)byte;
            if (written >= dstlen) {
                /* Buffer is full: the last slot becomes the terminator. */
                dst[written - 1] = 0;
                measuring = 1;
            }
        }
    }

    /* Count the space the unconverted remainder would need. */
    for (; in < in_end; ++in)
        written += (*in < 0x80U) ? 1 : 2;

    return written;
}
1046 :
1047 : #ifdef _WIN32
1048 :
1049 : /************************************************************************/
1050 : /* CPLWin32Recode() */
1051 : /************************************************************************/
1052 :
1053 : /* Convert a byte stream (ie normal c-string) in one CODEPAGE
1054 : to a byte stream (ie normal c-string) in another CODEPAGE.
1055 :
1056 : \a src is the source c-string byte stream (including a null terminator).
1057 : \a src_code_page is the code page of the source c-string.
1058 : \a dst_code_page is the code page of the destination c-string.
1059 :
1060 : UTF7 65000
1061 : UTF8 65001
1062 : OEM-US 437
1063 : OEM-ARABIC 720
1064 : OEM-GREEK 737
1065 : OEM-BALTIC 775
1066 : OEM-MLATIN1 850
1067 : OEM-LATIN2 852
1068 : OEM-CYRILLIC 855
1069 : OEM-TURKISH 857
1070 : OEM-MLATIN1P 858
1071 : OEM-HEBREW 862
1072 : OEM-RUSSIAN 866
1073 :
1074 : THAI 874
1075 : SJIS 932
1076 : GBK 936
1077 : KOREA 949
1078 : BIG5 950
1079 :
1080 : EUROPE 1250
1081 : CYRILLIC 1251
1082 : LATIN1 1252
1083 : GREEK 1253
1084 : TURKISH 1254
1085 : HEBREW 1255
1086 : ARABIC 1256
1087 : BALTIC 1257
1088 : VIETNAM 1258
1089 :
1090 : ISO-LATIN1 28591
1091 : ISO-LATIN2 28592
1092 : ISO-LATIN3 28593
1093 : ISO-BALTIC 28594
1094 : ISO-CYRILLIC 28595
1095 : ISO-ARABIC 28596
1096 : ISO-HEBREW 28598
1097 : ISO-TURKISH 28599
1098 : ISO-LATIN9 28605
1099 :
1100 : ISO-2022-JP 50220
1101 :
1102 : */
1103 :
1104 : char* CPLWin32Recode( const char* src, unsigned src_code_page, unsigned dst_code_page )
1105 : {
1106 : /* Convert from source code page to Unicode */
1107 :
1108 : /* Compute the length in wide characters */
1109 : int wlen = MultiByteToWideChar( src_code_page, MB_ERR_INVALID_CHARS, src, -1, 0, 0 );
1110 : if (wlen == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
1111 : {
1112 : if (!bHaveWarned5)
1113 : {
1114 : bHaveWarned5 = TRUE;
1115 : CPLError(CE_Warning, CPLE_AppDefined,
1116 : "One or several characters could not be translated from CP%d. "
1117 : "This warning will not be emitted anymore.", src_code_page);
1118 : }
1119 :
1120 : /* Retry now without MB_ERR_INVALID_CHARS flag */
1121 : wlen = MultiByteToWideChar( src_code_page, 0, src, -1, 0, 0 );
1122 : }
1123 :
1124 : /* Do the actual conversion */
1125 : wchar_t* tbuf = (wchar_t*)CPLCalloc(sizeof(wchar_t),wlen+1);
1126 : tbuf[wlen] = 0;
1127 : MultiByteToWideChar( src_code_page, 0, src, -1, tbuf, wlen+1 );
1128 :
1129 : /* Convert from Unicode to destination code page */
1130 :
1131 : /* Compute the length in chars */
1132 : BOOL bUsedDefaultChar = FALSE;
1133 : int len;
1134 : if ( dst_code_page == CP_UTF7 || dst_code_page == CP_UTF8 )
1135 : len = WideCharToMultiByte( dst_code_page, 0, tbuf, -1, 0, 0, 0, NULL );
1136 : else
1137 : len = WideCharToMultiByte( dst_code_page, 0, tbuf, -1, 0, 0, 0, &bUsedDefaultChar );
1138 : if (bUsedDefaultChar)
1139 : {
1140 : if (!bHaveWarned6)
1141 : {
1142 : bHaveWarned6 = TRUE;
1143 : CPLError(CE_Warning, CPLE_AppDefined,
1144 : "One or several characters could not be translated to CP%d. "
1145 : "This warning will not be emitted anymore.", dst_code_page);
1146 : }
1147 : }
1148 :
1149 : /* Do the actual conversion */
1150 : char* pszResult = (char*)CPLCalloc(sizeof(char),len+1);
1151 : WideCharToMultiByte( dst_code_page, 0, tbuf, -1, pszResult, len+1, 0, NULL );
1152 : pszResult[len] = 0;
1153 :
1154 : /* Cleanup */
1155 : CPLFree(tbuf);
1156 :
1157 : return pszResult;
1158 : }
1159 :
1160 : #endif
1161 :
1162 :
1163 :
1164 : /*
1165 : ** For now we disable the rest which is locale() related. We may need
1166 : ** parts of it later.
1167 : */
1168 :
1169 : #ifdef notdef
1170 :
1171 : #ifdef _WIN32
1172 : # include <windows.h>
1173 : #endif
1174 :
1175 : /*! Return true if the "locale" seems to indicate that UTF-8 encoding
1176 : is used. If true the utf8tomb and utf8frommb don't do anything
1177 : useful.
1178 :
1179 : <i>It is highly recommended that you change your system so this
1180 : does return true.</i> On Windows this is done by setting the
1181 : "codepage" to CP_UTF8. On Unix this is done by setting $LC_CTYPE
1182 : to a string containing the letters "utf" or "UTF" in it, or by
1183 : deleting all $LC* and $LANG environment variables. In the future
1184 : it is likely that all non-Asian Unix systems will return true,
1185 : due to the compatibility of UTF-8 with ISO-8859-1.
1186 : */
int utf8locale(void) {
    /* The verdict is computed once and cached; 2 means "not yet known". */
    static int cached = 2;

    if (cached != 2)
        return cached;

#ifdef _WIN32
    cached = GetACP() == CP_UTF8;
#else
    {
        /* The first of these variables that is set and non-empty decides. */
        static const char* const vars[3] = {"LC_CTYPE", "LC_ALL", "LANG"};
        unsigned k;

        cached = 1;  /* assume UTF-8 if no locale is configured */
        for (k = 0; k < 3; ++k) {
            const char* val = getenv(vars[k]);
            if (val && *val) {
                cached = (strstr(val, "utf") != 0 || strstr(val, "UTF") != 0);
                break;
            }
        }
    }
#endif
    return cached;
}
1204 :
1205 : /*! Convert the UTF-8 used by FLTK to the locale-specific encoding
1206 : used for filenames (and sometimes used for data in files).
1207 : Unfortunately due to stupid design you will have to do this as
1208 : needed for filenames. This is a bug on both Unix and Windows.
1209 :
1210 : Up to \a dstlen bytes are written to \a dst, including a null
1211 : terminator. The return value is the number of bytes that would be
1212 : written, not counting the null terminator. If greater or equal to
1213 : \a dstlen then if you malloc a new array of size n+1 you will have
1214 : the space needed for the entire string. If \a dstlen is zero then
1215 : nothing is written and this call just measures the storage space
1216 : needed.
1217 :
1218 : If utf8locale() returns true then this does not change the data.
1219 : It is copied and truncated as necessary to
1220 : the destination buffer and \a srclen is always returned. */
1221 : unsigned utf8tomb(const char* src, unsigned srclen,
1222 : char* dst, unsigned dstlen)
1223 : {
1224 : if (!utf8locale()) {
1225 : #ifdef _WIN32
1226 : wchar_t lbuf[1024];
1227 : wchar_t* buf = lbuf;
1228 : unsigned length = utf8towc(src, srclen, buf, 1024);
1229 : unsigned ret;
1230 : if (length >= 1024) {
1231 : buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
1232 : utf8towc(src, srclen, buf, length+1);
1233 : }
1234 : if (dstlen) {
1235 : // apparently this does not null-terminate, even though msdn
1236 : // documentation claims it does:
1237 : ret =
1238 : WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0, 0);
1239 : dst[ret] = 0;
1240 : }
1241 : // if it overflows or measuring length, get the actual length:
1242 : if (dstlen==0 || ret >= dstlen-1)
1243 : ret =
1244 : WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
1245 : if (buf != lbuf) free((void*)buf);
1246 : return ret;
1247 : #else
1248 : wchar_t lbuf[1024];
1249 : wchar_t* buf = lbuf;
1250 : unsigned length = utf8towc(src, srclen, buf, 1024);
1251 : int ret;
1252 : if (length >= 1024) {
1253 : buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
1254 : utf8towc(src, srclen, buf, length+1);
1255 : }
1256 : if (dstlen) {
1257 : ret = wcstombs(dst, buf, dstlen);
1258 : if (ret >= dstlen-1) ret = wcstombs(0,buf,0);
1259 : } else {
1260 : ret = wcstombs(0,buf,0);
1261 : }
1262 : if (buf != lbuf) free((void*)buf);
1263 : if (ret >= 0) return (unsigned)ret;
1264 : // on any errors we return the UTF-8 as raw text...
1265 : #endif
1266 : }
1267 : // identity transform:
1268 : if (srclen < dstlen) {
1269 : memcpy(dst, src, srclen);
1270 : dst[srclen] = 0;
1271 : } else {
1272 : memcpy(dst, src, dstlen-1);
1273 : dst[dstlen-1] = 0;
1274 : }
1275 : return srclen;
1276 : }
1277 :
1278 : /*! Convert a filename from the locale-specific multibyte encoding
1279 : used by Windows to UTF-8 as used by FLTK.
1280 :
1281 : Up to \a dstlen bytes are written to \a dst, including a null
1282 : terminator. The return value is the number of bytes that would be
1283 : written, not counting the null terminator. If greater or equal to
1284 : \a dstlen then if you malloc a new array of size n+1 you will have
1285 : the space needed for the entire string. If \a dstlen is zero then
1286 : nothing is written and this call just measures the storage space
1287 : needed.
1288 :
1289 : On Unix or on Windows when a UTF-8 locale is in effect, this
1290 : does not change the data. It is copied and truncated as necessary to
1291 : the destination buffer and \a srclen is always returned.
1292 : You may also want to check if utf8test() returns non-zero, so that
1293 : the filesystem can store filenames in UTF-8 encoding regardless of
1294 : the locale.
1295 : */
1296 : unsigned utf8frommb(char* dst, unsigned dstlen,
1297 : const char* src, unsigned srclen)
1298 : {
1299 : if (!utf8locale()) {
1300 : #ifdef _WIN32
1301 : wchar_t lbuf[1024];
1302 : wchar_t* buf = lbuf;
1303 : unsigned length;
1304 : unsigned ret;
1305 : length =
1306 : MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
1307 : if (length >= 1024) {
1308 : length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
1309 : buf = (wchar_t*)(malloc(length*sizeof(wchar_t)));
1310 : MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
1311 : }
1312 : ret = utf8fromwc(dst, dstlen, buf, length);
1313 : if (buf != lbuf) free((void*)buf);
1314 : return ret;
1315 : #else
1316 : wchar_t lbuf[1024];
1317 : wchar_t* buf = lbuf;
1318 : int length;
1319 : unsigned ret;
1320 : length = mbstowcs(buf, src, 1024);
1321 : if (length >= 1024) {
1322 : length = mbstowcs(0, src, 0)+1;
1323 : buf = (wchar_t*)(malloc(length*sizeof(unsigned short)));
1324 : mbstowcs(buf, src, length);
1325 : }
1326 : if (length >= 0) {
1327 : ret = utf8fromwc(dst, dstlen, buf, length);
1328 : if (buf != lbuf) free((void*)buf);
1329 : return ret;
1330 : }
1331 : // errors in conversion return the UTF-8 unchanged
1332 : #endif
1333 : }
1334 : // identity transform:
1335 : if (srclen < dstlen) {
1336 : memcpy(dst, src, srclen);
1337 : dst[srclen] = 0;
1338 : } else {
1339 : memcpy(dst, src, dstlen-1);
1340 : dst[dstlen-1] = 0;
1341 : }
1342 : return srclen;
1343 : }
1344 :
1345 : #endif /* def notdef - disabled locale specific stuff */
1346 :
1347 : /*! Examines the first \a srclen bytes in \a src and return a verdict
1348 : on whether it is UTF-8 or not.
1349 : - Returns 0 if there is any illegal UTF-8 sequences, using the
1350 : same rules as utf8decode(). Note that some UCS values considered
1351 : illegal by RFC 3629, such as 0xffff, are considered legal by this.
1352 : - Returns 1 if there are only single-byte characters (ie no bytes
1353 : have the high bit set). This is legal UTF-8, but also indicates
1354 : plain ASCII. It also returns 1 if \a srclen is zero.
1355 : - Returns 2 if there are only characters less than 0x800.
1356 : - Returns 3 if there are only characters less than 0x10000.
1357 : - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
1358 :
1359 : Because there are many illegal sequences in UTF-8, it is almost
1360 : impossible for a string in another encoding to be confused with
1361 : UTF-8. This is very useful for transitioning Unix to UTF-8
1362 : filenames, you can simply test each filename with this to decide
1363 : if it is UTF-8 or in the locale encoding. My hope is that if
1364 : this is done we will be able to cleanly transition to a locale-less
1365 : encoding.
1366 : */
1367 :
1368 476054 : static int utf8test(const char* src, unsigned srclen) {
1369 476054 : int ret = 1;
1370 476054 : const char* p = src;
1371 476054 : const char* e = src+srclen;
1372 3727574 : while (p < e) {
1373 2775467 : if (*p & 0x80) {
1374 82 : int len; utf8decode(p,e,&len);
1375 82 : if (len < 2) return 0;
1376 81 : if (len > ret) ret = len;
1377 81 : p += len;
1378 : } else {
1379 2775385 : p++;
1380 : }
1381 : }
1382 476053 : return ret;
1383 : }
1384 :
1385 : #endif /* defined(CPL_RECODE_STUB) */
|