1 : /**********************************************************************
2 : * $Id: cpl_recode.cpp 24555 2012-06-10 09:49:55Z rouault $
3 : *
4 : * Name: cpl_recode.cpp
5 : * Project: CPL - Common Portability Library
6 : * Purpose: Character set recoding and char/wchar_t conversions.
7 : * Author: Andrey Kiselev, dron@ak4719.spb.edu
8 : *
9 : **********************************************************************
10 : * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
11 : * Copyright (c) 2008, Frank Warmerdam
12 : *
13 : * Permission to use, copy, modify, and distribute this software for any
14 : * purpose with or without fee is hereby granted, provided that the above
15 : * copyright notice and this permission notice appear in all copies.
16 : *
17 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
20 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
22 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
23 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24 : **********************************************************************/
25 :
26 : #include "cpl_string.h"
27 :
28 : CPL_CVSID("$Id: cpl_recode.cpp 24555 2012-06-10 09:49:55Z rouault $");
29 :
30 : #ifdef CPL_RECODE_ICONV
31 : extern void CPLClearRecodeIconvWarningFlags();
32 : extern char *CPLRecodeIconv( const char *, const char *, const char * );
33 : extern char *CPLRecodeFromWCharIconv( const wchar_t *,
34 : const char *, const char * );
35 : extern wchar_t *CPLRecodeToWCharIconv( const char *,
36 : const char *, const char * );
37 : #endif /* CPL_RECODE_ICONV */
38 :
39 : extern void CPLClearRecodeStubWarningFlags();
40 : extern char *CPLRecodeStub( const char *, const char *, const char * );
41 : extern char *CPLRecodeFromWCharStub( const wchar_t *,
42 : const char *, const char * );
43 : extern wchar_t *CPLRecodeToWCharStub( const char *,
44 : const char *, const char * );
45 : extern int CPLIsUTF8Stub( const char *, int );
46 :
47 : /************************************************************************/
48 : /* CPLRecode() */
49 : /************************************************************************/
50 :
51 : /**
52 : * Convert a string from a source encoding to a destination encoding.
53 : *
54 : * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
55 : * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
56 : * <ul>
57 : * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in fact)</li>
58 : * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
59 : * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
60 : * </ul>
61 : *
62 : * If an error occurs an error may, or may not be posted with CPLError().
63 : *
64 : * @param pszSource a NULL terminated string.
65 : * @param pszSrcEncoding the source encoding.
66 : * @param pszDstEncoding the destination encoding.
67 : *
68 : * @return a NULL terminated string which should be freed with CPLFree().
69 : *
70 : * @since GDAL 1.6.0
71 : */
72 :
73 55377 : char CPL_DLL *CPLRecode( const char *pszSource,
74 : const char *pszSrcEncoding,
75 : const char *pszDstEncoding )
76 :
77 : {
78 : /* -------------------------------------------------------------------- */
79 : /* Handle a few common short cuts. */
80 : /* -------------------------------------------------------------------- */
81 55377 : if ( EQUAL(pszSrcEncoding, pszDstEncoding) )
82 0 : return CPLStrdup(pszSource);
83 :
84 55377 : if ( EQUAL(pszSrcEncoding, CPL_ENC_ASCII)
85 : && ( EQUAL(pszDstEncoding, CPL_ENC_UTF8)
86 : || EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
87 0 : return CPLStrdup(pszSource);
88 :
89 : #ifdef CPL_RECODE_ICONV
90 : /* -------------------------------------------------------------------- */
91 : /* CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8 */
92 : /* and CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1 conversions are hadled */
93 : /* very well by the stub implementation which is faster than the */
94 : /* iconv() route. Use a stub for these two ones and iconv() */
95 : /* everything else. */
96 : /* -------------------------------------------------------------------- */
97 55377 : if ( ( EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1)
98 : && EQUAL(pszDstEncoding, CPL_ENC_UTF8) )
99 : || ( EQUAL(pszSrcEncoding, CPL_ENC_UTF8)
100 : && EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
101 : {
102 55342 : return CPLRecodeStub( pszSource, pszSrcEncoding, pszDstEncoding );
103 : }
104 : else
105 : {
106 35 : return CPLRecodeIconv( pszSource, pszSrcEncoding, pszDstEncoding );
107 : }
108 : #else /* CPL_RECODE_STUB */
109 : return CPLRecodeStub( pszSource, pszSrcEncoding, pszDstEncoding );
110 : #endif /* CPL_RECODE_ICONV */
111 : }
112 :
113 : /************************************************************************/
114 : /* CPLRecodeFromWChar() */
115 : /************************************************************************/
116 :
117 : /**
118 : * Convert wchar_t string to UTF-8.
119 : *
120 : * Convert a wchar_t string into a multibyte utf-8 string. The only
121 : * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
122 : * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
123 : * and CPL_ENC_ISO8859_1. In some cases (i.e. using iconv()) other encodings
124 : * may also be supported.
125 : *
126 : * Note that the wchar_t type varies in size on different systems. On
127 : * win32 it is normally 2 bytes, and on unix 4 bytes.
128 : *
129 : * If an error occurs an error may, or may not be posted with CPLError().
130 : *
131 : * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
132 : * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
133 : * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
134 : *
135 : * @return a zero terminated multi-byte string which should be freed with
136 : * CPLFree(), or NULL if an error occurs.
137 : *
138 : * @since GDAL 1.6.0
139 : */
140 :
141 12605 : char CPL_DLL *CPLRecodeFromWChar( const wchar_t *pwszSource,
142 : const char *pszSrcEncoding,
143 : const char *pszDstEncoding )
144 :
145 : {
146 : #ifdef CPL_RECODE_ICONV
147 : /* -------------------------------------------------------------------- */
148 : /* Conversions from CPL_ENC_UCS2 */
149 : /* to CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
150 : /* handled by the stub implementation. */
151 : /* -------------------------------------------------------------------- */
152 12605 : if ( (EQUAL(pszSrcEncoding, CPL_ENC_UCS2) || EQUAL(pszSrcEncoding, "WCHAR_T"))
153 : && ( EQUAL(pszDstEncoding, CPL_ENC_UTF8)
154 : || EQUAL(pszDstEncoding, CPL_ENC_ASCII)
155 : || EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
156 : {
157 : return CPLRecodeFromWCharStub( pwszSource,
158 12605 : pszSrcEncoding, pszDstEncoding );
159 : }
160 : else
161 : {
162 : return CPLRecodeFromWCharIconv( pwszSource,
163 0 : pszSrcEncoding, pszDstEncoding );
164 : }
165 : #else /* CPL_RECODE_STUB */
166 : return CPLRecodeFromWCharStub( pwszSource,
167 : pszSrcEncoding, pszDstEncoding );
168 : #endif /* CPL_RECODE_ICONV */
169 : }
170 :
171 : /************************************************************************/
172 : /* CPLRecodeToWChar() */
173 : /************************************************************************/
174 :
175 : /**
176 : * Convert UTF-8 string to a wchar_t string.
177 : *
178 : * Convert a 8bit, multi-byte per character input string into a wide
179 : * character (wchar_t) string. The only guaranteed supported source encodings
180 : * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1). The only
181 : * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source
182 : * and destination encodings may be supported depending on the underlying
183 : * implementation.
184 : *
185 : * Note that the wchar_t type varies in size on different systems. On
186 : * win32 it is normally 2 bytes, and on unix 4 bytes.
187 : *
188 : * If an error occurs an error may, or may not be posted with CPLError().
189 : *
190 : * @param pszSource input multi-byte character string.
191 : * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
192 : * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
193 : *
194 : * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
195 : * NULL on error.
196 : *
197 : * @since GDAL 1.6.0
198 : */
199 :
200 2933 : wchar_t CPL_DLL *CPLRecodeToWChar( const char *pszSource,
201 : const char *pszSrcEncoding,
202 : const char *pszDstEncoding )
203 :
204 : {
205 : #ifdef CPL_RECODE_ICONV
206 : /* -------------------------------------------------------------------- */
207 : /* Conversions to CPL_ENC_UCS2 */
208 : /* from CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
209 : /* handled by the stub implementation. */
210 : /* -------------------------------------------------------------------- */
211 2933 : if ( (EQUAL(pszDstEncoding, CPL_ENC_UCS2) || EQUAL(pszDstEncoding, "WCHAR_T"))
212 : && ( EQUAL(pszSrcEncoding, CPL_ENC_UTF8)
213 : || EQUAL(pszSrcEncoding, CPL_ENC_ASCII)
214 : || EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) ) )
215 : {
216 : return CPLRecodeToWCharStub( pszSource,
217 2933 : pszSrcEncoding, pszDstEncoding );
218 : }
219 : else
220 : {
221 : return CPLRecodeToWCharIconv( pszSource,
222 0 : pszSrcEncoding, pszDstEncoding );
223 : }
224 : #else /* CPL_RECODE_STUB */
225 : return CPLRecodeToWCharStub( pszSource, pszSrcEncoding, pszDstEncoding );
226 : #endif /* CPL_RECODE_ICONV */
227 : }
228 :
229 : /************************************************************************/
230 : /* CPLIsUTF8() */
231 : /************************************************************************/
232 :
233 : /**
234 : * Test if a string is encoded as UTF-8.
235 : *
236 : * @param pabyData input string to test
237 : * @param nLen length of the input string, or -1 if the function must compute
238 : * the string length. In which case it must be null terminated.
239 : * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
240 : *
241 : * @since GDAL 1.7.0
242 : */
243 476054 : int CPLIsUTF8(const char* pabyData, int nLen)
244 : {
245 476054 : return CPLIsUTF8Stub( pabyData, nLen );
246 : }
247 :
248 : /************************************************************************/
249 : /* CPLForceToASCII() */
250 : /************************************************************************/
251 :
252 : /**
253 : * Return a new string that is made only of ASCII characters. If non-ASCII
254 : * characters are found in the input string, they will be replaced by the
255 : * provided replacement character.
256 : *
257 : * @param pabyData input string to test
258 : * @param nLen length of the input string, or -1 if the function must compute
259 : * the string length. In which case it must be null terminated.
260 : * @param chReplacementChar character which will be used when the input stream
261 : * contains a non ASCII character. Must be valid ASCII !
262 : *
263 : * @return a new string that must be freed with CPLFree().
264 : *
265 : * @since GDAL 1.7.0
266 : */
267 1 : char CPL_DLL *CPLForceToASCII(const char* pabyData, int nLen, char chReplacementChar)
268 : {
269 1 : if (nLen < 0)
270 1 : nLen = strlen(pabyData);
271 1 : char* pszOutputString = (char*)CPLMalloc(nLen + 1);
272 : int i;
273 5 : for(i=0;i<nLen;i++)
274 : {
275 4 : if (((unsigned char*)pabyData)[i] > 127)
276 1 : pszOutputString[i] = chReplacementChar;
277 : else
278 3 : pszOutputString[i] = pabyData[i];
279 : }
280 1 : pszOutputString[i] = '\0';
281 1 : return pszOutputString;
282 : }
283 :
284 : /************************************************************************/
285 : /* CPLEncodingCharSize() */
286 : /************************************************************************/
287 :
288 : /**
289 : * Return bytes per character for encoding.
290 : *
291 : * This function returns the size in bytes of the smallest character
292 : * in this encoding. For fixed width encodings (ASCII, UCS-2, UCS-4) this
293 : * is straightforward. For encodings like UTF8 and UTF16 which represent
294 : * some characters as a sequence of atomic character sizes the function
295 : * still returns the atomic character size (1 for UTF8, 2 for UTF16).
296 : *
297 : * This function will return the correct value for well known encodings
298 : * with corresponding CPL_ENC_ values. It may not return the correct value
299 : * for other encodings even if they are supported by the underlying iconv
300 : * or windows transliteration services. Hopefully it will improve over time.
301 : *
302 : * @param pszEncoding the name of the encoding.
303 : *
304 : * @return the size of a minimal character in bytes or -1 if the size is
305 : * unknown.
306 : */
307 :
308 0 : int CPLEncodingCharSize( const char *pszEncoding )
309 :
310 : {
311 0 : if( EQUAL(pszEncoding,CPL_ENC_UTF8) )
312 0 : return 1;
313 0 : else if( EQUAL(pszEncoding,CPL_ENC_UTF16) )
314 0 : return 2;
315 0 : else if( EQUAL(pszEncoding,CPL_ENC_UCS2) )
316 0 : return 2;
317 0 : else if( EQUAL(pszEncoding,CPL_ENC_UCS4) )
318 0 : return 4;
319 0 : else if( EQUAL(pszEncoding,CPL_ENC_ASCII) )
320 0 : return 1;
321 0 : else if( EQUALN(pszEncoding,"ISO-8859-",9) )
322 0 : return 1;
323 : else
324 0 : return -1;
325 : }
326 :
327 : /************************************************************************/
328 : /* CPLClearRecodeWarningFlags() */
329 : /************************************************************************/
330 :
331 6786 : void CPLClearRecodeWarningFlags()
332 : {
333 : #ifdef CPL_RECODE_ICONV
334 6786 : CPLClearRecodeIconvWarningFlags();
335 : #endif
336 6786 : CPLClearRecodeStubWarningFlags();
337 6786 : }
|