1 : /**********************************************************************
2 : * $Id: cpl_recode.cpp 22600 2011-06-28 13:36:36Z warmerdam $
3 : *
4 : * Name: cpl_recode.cpp
5 : * Project: CPL - Common Portability Library
6 : * Purpose: Character set recoding and char/wchar_t conversions.
7 : * Author: Andrey Kiselev, dron@ak4719.spb.edu
8 : *
9 : **********************************************************************
10 : * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
11 : * Copyright (c) 2008, Frank Warmerdam
12 : *
13 : * Permission to use, copy, modify, and distribute this software for any
14 : * purpose with or without fee is hereby granted, provided that the above
15 : * copyright notice and this permission notice appear in all copies.
16 : *
17 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
20 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
22 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
23 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24 : **********************************************************************/
25 :
26 : #include "cpl_string.h"
27 :
28 : CPL_CVSID("$Id: cpl_recode.cpp 22600 2011-06-28 13:36:36Z warmerdam $");
29 :
30 : #ifdef CPL_RECODE_ICONV
31 : extern char *CPLRecodeIconv( const char *, const char *, const char * );
32 : extern char *CPLRecodeFromWCharIconv( const wchar_t *,
33 : const char *, const char * );
34 : extern wchar_t *CPLRecodeToWCharIconv( const char *,
35 : const char *, const char * );
36 : #endif /* CPL_RECODE_ICONV */
37 :
38 : extern char *CPLRecodeStub( const char *, const char *, const char * );
39 : extern char *CPLRecodeFromWCharStub( const wchar_t *,
40 : const char *, const char * );
41 : extern wchar_t *CPLRecodeToWCharStub( const char *,
42 : const char *, const char * );
43 : extern int CPLIsUTF8Stub( const char *, int );
44 :
45 : /************************************************************************/
46 : /* CPLRecode() */
47 : /************************************************************************/
48 :
49 : /**
50 : * Convert a string from a source encoding to a destination encoding.
51 : *
52 : * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
53 : * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
54 : * <ul>
55 : * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in fact)</li>
56 : * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
57 : * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
58 : * </ul>
59 : *
60 : * If an error occurs an error may, or may not be posted with CPLError().
61 : *
62 : * @param pszSource a NULL terminated string.
63 : * @param pszSrcEncoding the source encoding.
64 : * @param pszDstEncoding the destination encoding.
65 : *
66 : * @return a NULL terminated string which should be freed with CPLFree().
67 : *
68 : * @since GDAL 1.6.0
69 : */
70 :
71 18630 : char CPL_DLL *CPLRecode( const char *pszSource,
72 : const char *pszSrcEncoding,
73 : const char *pszDstEncoding )
74 :
75 : {
76 : /* -------------------------------------------------------------------- */
77 : /* Handle a few common short cuts. */
78 : /* -------------------------------------------------------------------- */
79 18630 : if ( EQUAL(pszSrcEncoding, pszDstEncoding) )
80 0 : return CPLStrdup(pszSource);
81 :
82 18630 : if ( EQUAL(pszSrcEncoding, CPL_ENC_ASCII)
83 : && ( EQUAL(pszDstEncoding, CPL_ENC_UTF8)
84 : || EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
85 0 : return CPLStrdup(pszSource);
86 :
87 : #ifdef CPL_RECODE_ICONV
88 : /* -------------------------------------------------------------------- */
89 : /* CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8 */
90 : /* and CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1 conversions are hadled */
91 : /* very well by the stub implementation which is faster than the */
92 : /* iconv() route. Use a stub for these two ones and iconv() */
93 : /* everything else. */
94 : /* -------------------------------------------------------------------- */
95 18630 : if ( ( EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1)
96 : && EQUAL(pszDstEncoding, CPL_ENC_UTF8) )
97 : || ( EQUAL(pszSrcEncoding, CPL_ENC_UTF8)
98 : && EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
99 : {
100 17986 : return CPLRecodeStub( pszSource, pszSrcEncoding, pszDstEncoding );
101 : }
102 : else
103 : {
104 644 : return CPLRecodeIconv( pszSource, pszSrcEncoding, pszDstEncoding );
105 : }
106 : #else /* CPL_RECODE_STUB */
107 : return CPLRecodeStub( pszSource, pszSrcEncoding, pszDstEncoding );
108 : #endif /* CPL_RECODE_ICONV */
109 : }
110 :
111 : /************************************************************************/
112 : /* CPLRecodeFromWChar() */
113 : /************************************************************************/
114 :
115 : /**
116 : * Convert wchar_t string to UTF-8.
117 : *
118 : * Convert a wchar_t string into a multibyte utf-8 string. The only
119 : * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
120 : * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
121 : * and CPL_ENC_ISO8859_1. In some cases (ie. using iconv()) other encodings
122 : * may also be supported.
123 : *
124 : * Note that the wchar_t type varies in size on different systems. On
125 : * win32 it is normally 2 bytes, and on unix 4 bytes.
126 : *
127 : * If an error occurs an error may, or may not be posted with CPLError().
128 : *
129 : * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
130 : * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
131 : * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
132 : *
133 : * @return a zero terminated multi-byte string which should be freed with
134 : * CPLFree(), or NULL if an error occurs.
135 : *
136 : * @since GDAL 1.6.0
137 : */
138 :
139 2248 : char CPL_DLL *CPLRecodeFromWChar( const wchar_t *pwszSource,
140 : const char *pszSrcEncoding,
141 : const char *pszDstEncoding )
142 :
143 : {
144 : #ifdef CPL_RECODE_ICONV
145 : /* -------------------------------------------------------------------- */
146 : /* Conversions from CPL_ENC_UCS2 */
147 : /* to CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
148 : /* handled by the stub implementation. */
149 : /* -------------------------------------------------------------------- */
150 2248 : if ( (EQUAL(pszSrcEncoding, CPL_ENC_UCS2) || EQUAL(pszSrcEncoding, "WCHAR_T"))
151 : && ( EQUAL(pszDstEncoding, CPL_ENC_UTF8)
152 : || EQUAL(pszDstEncoding, CPL_ENC_ASCII)
153 : || EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
154 : {
155 : return CPLRecodeFromWCharStub( pwszSource,
156 2248 : pszSrcEncoding, pszDstEncoding );
157 : }
158 : else
159 : {
160 : return CPLRecodeFromWCharIconv( pwszSource,
161 0 : pszSrcEncoding, pszDstEncoding );
162 : }
163 : #else /* CPL_RECODE_STUB */
164 : return CPLRecodeFromWCharStub( pwszSource,
165 : pszSrcEncoding, pszDstEncoding );
166 : #endif /* CPL_RECODE_ICONV */
167 : }
168 :
169 : /************************************************************************/
170 : /* CPLRecodeToWChar() */
171 : /************************************************************************/
172 :
173 : /**
174 : * Convert UTF-8 string to a wchar_t string.
175 : *
176 : * Convert a 8bit, multi-byte per character input string into a wide
177 : * character (wchar_t) string. The only guaranteed supported source encodings
178 : * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1). The only
179 : * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source
180 : * and destination encodings may be supported depending on the underlying
181 : * implementation.
182 : *
183 : * Note that the wchar_t type varies in size on different systems. On
184 : * win32 it is normally 2 bytes, and on unix 4 bytes.
185 : *
186 : * If an error occurs an error may, or may not be posted with CPLError().
187 : *
188 : * @param pszSource input multi-byte character string.
189 : * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
190 : * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
191 : *
192 : * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
193 : * NULL on error.
194 : *
195 : * @since GDAL 1.6.0
196 : */
197 :
198 5140 : wchar_t CPL_DLL *CPLRecodeToWChar( const char *pszSource,
199 : const char *pszSrcEncoding,
200 : const char *pszDstEncoding )
201 :
202 : {
203 : #ifdef CPL_RECODE_ICONV
204 : /* -------------------------------------------------------------------- */
205 : /* Conversions to CPL_ENC_UCS2 */
206 : /* from CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
207 : /* handled by the stub implementation. */
208 : /* -------------------------------------------------------------------- */
209 5140 : if ( (EQUAL(pszDstEncoding, CPL_ENC_UCS2) || EQUAL(pszDstEncoding, "WCHAR_T"))
210 : && ( EQUAL(pszSrcEncoding, CPL_ENC_UTF8)
211 : || EQUAL(pszSrcEncoding, CPL_ENC_ASCII)
212 : || EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) ) )
213 : {
214 : return CPLRecodeToWCharStub( pszSource,
215 5140 : pszSrcEncoding, pszDstEncoding );
216 : }
217 : else
218 : {
219 : return CPLRecodeToWCharIconv( pszSource,
220 0 : pszSrcEncoding, pszDstEncoding );
221 : }
222 : #else /* CPL_RECODE_STUB */
223 : return CPLRecodeToWCharStub( pszSource, pszSrcEncoding, pszDstEncoding );
224 : #endif /* CPL_RECODE_ICONV */
225 : }
226 :
227 : /************************************************************************/
228 : /* CPLIsUTF8() */
229 : /************************************************************************/
230 :
231 : /**
232 : * Test if a string is encoded as UTF-8.
233 : *
234 : * @param pabyData input string to test
235 : * @param nLen length of the input string, or -1 if the function must compute
236 : * the string length. In which case it must be null terminated.
237 : * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
238 : *
239 : * @since GDAL 1.7.0
240 : */
241 951570 : int CPLIsUTF8(const char* pabyData, int nLen)
242 : {
243 951570 : return CPLIsUTF8Stub( pabyData, nLen );
244 : }
245 :
246 : /************************************************************************/
247 : /* CPLForceToASCII() */
248 : /************************************************************************/
249 :
250 : /**
251 : * Return a new string that is made only of ASCII characters. If non-ASCII
252 : * characters are found in the input string, they will be replaced by the
253 : * provided replacement character.
254 : *
255 : * @param pabyData input string to test
256 : * @param nLen length of the input string, or -1 if the function must compute
257 : * the string length. In which case it must be null terminated.
258 : * @param chReplacementChar character which will be used when the input stream
259 : * contains a non ASCII character. Must be valid ASCII !
260 : *
261 : * @return a new string that must be freed with CPLFree().
262 : *
263 : * @since GDAL 1.7.0
264 : */
265 2 : char CPL_DLL *CPLForceToASCII(const char* pabyData, int nLen, char chReplacementChar)
266 : {
267 2 : if (nLen < 0)
268 2 : nLen = strlen(pabyData);
269 2 : char* pszOutputString = (char*)CPLMalloc(nLen + 1);
270 : int i;
271 10 : for(i=0;i<nLen;i++)
272 : {
273 8 : if (((unsigned char*)pabyData)[i] > 127)
274 2 : pszOutputString[i] = chReplacementChar;
275 : else
276 6 : pszOutputString[i] = pabyData[i];
277 : }
278 2 : pszOutputString[i] = '\0';
279 2 : return pszOutputString;
280 : }
281 :
282 : /************************************************************************/
283 : /* CPLEncodingCharSize() */
284 : /************************************************************************/
285 :
286 : /**
287 : * Return bytes per character for encoding.
288 : *
289 : * This function returns the size in bytes of the smallest character
290 : * in this encoding. For fixed width encodings (ASCII, UCS-2, UCS-4) this
291 : * is straight forward. For encodings like UTF8 and UTF16 which represent
292 : * some characters as a sequence of atomic character sizes the function
293 : * still returns the atomic character size (1 for UTF8, 2 for UTF16).
294 : *
295 : * This function will return the correct value for well known encodings
296 : * with corresponding CPL_ENC_ values. It may not return the correct value
297 : * for other encodings even if they are supported by the underlying iconv
298 : * or windows transliteration services. Hopefully it will improve over time.
299 : *
300 : * @param pszEncoding the name of the encoding.
301 : *
302 : * @return the size of a minimal character in bytes or -1 if the size is
303 : * unknown.
304 : */
305 :
306 0 : int CPLEncodingCharSize( const char *pszEncoding )
307 :
308 : {
309 0 : if( EQUAL(pszEncoding,CPL_ENC_UTF8) )
310 0 : return 1;
311 0 : else if( EQUAL(pszEncoding,CPL_ENC_UTF16) )
312 0 : return 2;
313 0 : else if( EQUAL(pszEncoding,CPL_ENC_UCS2) )
314 0 : return 2;
315 0 : else if( EQUAL(pszEncoding,CPL_ENC_UCS4) )
316 0 : return 4;
317 0 : else if( EQUAL(pszEncoding,CPL_ENC_ASCII) )
318 0 : return 1;
319 0 : else if( EQUALN(pszEncoding,"ISO-8859-",9) )
320 0 : return 1;
321 : else
322 0 : return -1;
323 : }
324 :
|