1 : /**********************************************************************
2 : * $Id: cpl_recode_iconv.cpp 24555 2012-06-10 09:49:55Z rouault $
3 : *
4 : * Name: cpl_recode_iconv.cpp
5 : * Project: CPL - Common Portability Library
6 : * Purpose: Character set recoding and char/wchar_t conversions implemented
7 : * using the iconv() functionality.
8 : * Author: Andrey Kiselev, dron@ak4719.spb.edu
9 : *
10 : **********************************************************************
11 : * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
12 : *
13 : * Permission to use, copy, modify, and distribute this software for any
14 : * purpose with or without fee is hereby granted, provided that the above
15 : * copyright notice and this permission notice appear in all copies.
16 : *
17 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
20 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
22 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
23 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24 : **********************************************************************/
25 :
26 : #include "cpl_port.h"
27 :
28 : CPL_CVSID("$Id: cpl_recode_iconv.cpp 24555 2012-06-10 09:49:55Z rouault $");
29 :
30 : #ifdef CPL_RECODE_ICONV
31 :
32 : #include <iconv.h>
33 : #include "cpl_string.h"
34 :
35 : #ifndef ICONV_CPP_CONST
36 : #define ICONV_CPP_CONST ICONV_CONST
37 : #endif
38 :
39 : #define CPL_RECODE_DSTBUF_SIZE 32768
40 :
41 : /************************************************************************/
42 : /* CPLClearRecodeIconvWarningFlags() */
43 : /************************************************************************/
44 :
45 : static int bHaveWarned1 = FALSE;
46 : static int bHaveWarned2 = FALSE;
47 :
48 6786 : void CPLClearRecodeIconvWarningFlags()
49 : {
50 6786 : bHaveWarned1 = FALSE;
51 6786 : bHaveWarned2 = FALSE;
52 6786 : }
53 :
54 : /************************************************************************/
55 : /* CPLRecodeIconv() */
56 : /************************************************************************/
57 :
58 : /**
59 : * Convert a string from a source encoding to a destination encoding
60 : * using the iconv() function.
61 : *
62 : * If an error occurs an error may, or may not be posted with CPLError().
63 : *
64 : * @param pszSource a NULL terminated string.
65 : * @param pszSrcEncoding the source encoding.
66 : * @param pszDstEncoding the destination encoding.
67 : *
68 : * @return a NULL terminated string which should be freed with CPLFree().
69 : */
70 :
71 35 : char *CPLRecodeIconv( const char *pszSource,
72 : const char *pszSrcEncoding,
73 : const char *pszDstEncoding )
74 :
75 : {
76 : iconv_t sConv;
77 :
78 35 : sConv = iconv_open( pszDstEncoding, pszSrcEncoding );
79 :
80 35 : if ( sConv == (iconv_t)-1 )
81 : {
82 : CPLError( CE_Warning, CPLE_AppDefined,
83 : "Recode from %s to %s failed with the error: \"%s\".",
84 1 : pszSrcEncoding, pszDstEncoding, strerror(errno) );
85 :
86 1 : return CPLStrdup(pszSource);
87 : }
88 :
89 : /* -------------------------------------------------------------------- */
90 : /* XXX: There is a portability issue: iconv() function could be */
91 : /* declared differently on different platforms. The second */
92 : /* argument could be declared as char** (as POSIX defines) or */
93 : /* as a const char**. Handle it with the ICONV_CPP_CONST macro here. */
94 : /* -------------------------------------------------------------------- */
95 34 : ICONV_CPP_CONST char *pszSrcBuf = (ICONV_CPP_CONST char *)pszSource;
96 34 : size_t nSrcLen = strlen( pszSource );
97 34 : size_t nDstCurLen = MAX(CPL_RECODE_DSTBUF_SIZE, nSrcLen + 1);
98 34 : size_t nDstLen = nDstCurLen;
99 34 : char *pszDestination = (char *)CPLCalloc( nDstCurLen, sizeof(char) );
100 34 : char *pszDstBuf = pszDestination;
101 :
102 102 : while ( nSrcLen > 0 )
103 : {
104 : size_t nConverted =
105 34 : iconv( sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen );
106 :
107 34 : if ( nConverted == (size_t)-1 )
108 : {
109 0 : if ( errno == EILSEQ )
110 : {
111 : // Skip the invalid sequence in the input string.
112 0 : if (!bHaveWarned1)
113 : {
114 0 : bHaveWarned1 = TRUE;
115 : CPLError(CE_Warning, CPLE_AppDefined,
116 : "One or several characters couldn't be converted correctly from %s to %s.\n"
117 : "This warning will not be emitted anymore",
118 0 : pszSrcEncoding, pszDstEncoding);
119 : }
120 0 : nSrcLen--, pszSrcBuf++;
121 0 : continue;
122 : }
123 :
124 0 : else if ( errno == E2BIG )
125 : {
126 : // We are running out of the output buffer.
127 : // Dynamically increase the buffer size.
128 0 : size_t nTmp = nDstCurLen;
129 0 : nDstCurLen *= 2;
130 : pszDestination =
131 0 : (char *)CPLRealloc( pszDestination, nDstCurLen );
132 0 : pszDstBuf = pszDestination + nTmp - nDstLen;
133 0 : nDstLen += nDstCurLen - nTmp;
134 0 : continue;
135 : }
136 :
137 : else
138 0 : break;
139 : }
140 : }
141 :
142 34 : pszDestination[nDstCurLen - nDstLen] = '\0';
143 :
144 34 : iconv_close( sConv );
145 :
146 34 : return pszDestination;
147 : }
148 :
149 : /************************************************************************/
150 : /* CPLRecodeFromWCharIconv() */
151 : /************************************************************************/
152 :
153 : /**
154 : * Convert wchar_t string to UTF-8.
155 : *
156 : * Convert a wchar_t string into a multibyte utf-8 string
157 : * using the iconv() function.
158 : *
159 : * Note that the wchar_t type varies in size on different systems. On
160 : * win32 it is normally 2 bytes, and on unix 4 bytes.
161 : *
162 : * If an error occurs an error may, or may not be posted with CPLError().
163 : *
164 : * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
165 : * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
166 : * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
167 : *
168 : * @return a zero terminated multi-byte string which should be freed with
169 : * CPLFree(), or NULL if an error occurs.
170 : */
171 :
172 0 : char *CPLRecodeFromWCharIconv( const wchar_t *pwszSource,
173 : const char *pszSrcEncoding,
174 : const char *pszDstEncoding )
175 :
176 : {
177 : /* -------------------------------------------------------------------- */
178 : /* What is the source length. */
179 : /* -------------------------------------------------------------------- */
180 0 : size_t nSrcLen = 0;
181 :
182 0 : while ( pwszSource[nSrcLen] != 0 )
183 0 : nSrcLen++;
184 :
185 : /* -------------------------------------------------------------------- */
186 : /* iconv() does not support wchar_t so we need to repack the */
187 : /* characters according to the width of a character in the */
188 : /* source encoding. For instance if wchar_t is 4 bytes but our */
189 : /* source is UTF16 then we need to pack down into 2 byte */
190 : /* characters before passing to iconv(). */
191 : /* -------------------------------------------------------------------- */
192 0 : int nTargetCharWidth = CPLEncodingCharSize( pszSrcEncoding );
193 :
194 0 : if( nTargetCharWidth < 1 )
195 : {
196 : CPLError( CE_Warning, CPLE_AppDefined,
197 : "Recode from %s with CPLRecodeFromWChar() failed because"
198 : " the width of characters in the encoding are not known.",
199 0 : pszSrcEncoding );
200 0 : return CPLStrdup("");
201 : }
202 :
203 0 : GByte *pszIconvSrcBuf = (GByte*) CPLCalloc((nSrcLen+1),nTargetCharWidth);
204 : unsigned int iSrc;
205 :
206 0 : for( iSrc = 0; iSrc <= nSrcLen; iSrc++ )
207 : {
208 0 : if( nTargetCharWidth == 1 )
209 0 : pszIconvSrcBuf[iSrc] = (GByte) pwszSource[iSrc];
210 0 : else if( nTargetCharWidth == 2 )
211 0 : ((short *)pszIconvSrcBuf)[iSrc] = (short) pwszSource[iSrc];
212 0 : else if( nTargetCharWidth == 4 )
213 0 : ((GInt32 *)pszIconvSrcBuf)[iSrc] = pwszSource[iSrc];
214 : }
215 :
216 : /* -------------------------------------------------------------------- */
217 : /* Create the iconv() translation object. */
218 : /* -------------------------------------------------------------------- */
219 : iconv_t sConv;
220 :
221 0 : sConv = iconv_open( pszDstEncoding, pszSrcEncoding );
222 :
223 0 : if ( sConv == (iconv_t)-1 )
224 : {
225 0 : CPLFree( pszIconvSrcBuf );
226 : CPLError( CE_Warning, CPLE_AppDefined,
227 : "Recode from %s to %s failed with the error: \"%s\".",
228 0 : pszSrcEncoding, pszDstEncoding, strerror(errno) );
229 :
230 0 : return CPLStrdup( "" );
231 : }
232 :
233 : /* -------------------------------------------------------------------- */
234 : /* XXX: There is a portability issue: iconv() function could be */
235 : /* declared differently on different platforms. The second */
236 : /* argument could be declared as char** (as POSIX defines) or */
237 : /* as a const char**. Handle it with the ICONV_CPP_CONST macro here. */
238 : /* -------------------------------------------------------------------- */
239 0 : ICONV_CPP_CONST char *pszSrcBuf = (ICONV_CPP_CONST char *) pszIconvSrcBuf;
240 :
241 : /* iconv expects a number of bytes, not characters */
242 0 : nSrcLen *= sizeof(wchar_t);
243 :
244 : /* -------------------------------------------------------------------- */
245 : /* Allocate destination buffer. */
246 : /* -------------------------------------------------------------------- */
247 0 : size_t nDstCurLen = MAX(CPL_RECODE_DSTBUF_SIZE, nSrcLen + 1);
248 0 : size_t nDstLen = nDstCurLen;
249 0 : char *pszDestination = (char *)CPLCalloc( nDstCurLen, sizeof(char) );
250 0 : char *pszDstBuf = pszDestination;
251 :
252 0 : while ( nSrcLen > 0 )
253 : {
254 : size_t nConverted =
255 0 : iconv( sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen );
256 :
257 0 : if ( nConverted == (size_t)-1 )
258 : {
259 0 : if ( errno == EILSEQ )
260 : {
261 : // Skip the invalid sequence in the input string.
262 0 : nSrcLen--;
263 0 : pszSrcBuf += sizeof(wchar_t);
264 0 : if (!bHaveWarned2)
265 : {
266 0 : bHaveWarned2 = TRUE;
267 : CPLError(CE_Warning, CPLE_AppDefined,
268 : "One or several characters couldn't be converted correctly from %s to %s.\n"
269 : "This warning will not be emitted anymore",
270 0 : pszSrcEncoding, pszDstEncoding);
271 : }
272 0 : continue;
273 : }
274 :
275 0 : else if ( errno == E2BIG )
276 : {
277 : // We are running out of the output buffer.
278 : // Dynamically increase the buffer size.
279 0 : size_t nTmp = nDstCurLen;
280 0 : nDstCurLen *= 2;
281 : pszDestination =
282 0 : (char *)CPLRealloc( pszDestination, nDstCurLen );
283 0 : pszDstBuf = pszDestination + nTmp - nDstLen;
284 0 : nDstLen += nDstCurLen - nTmp;
285 0 : continue;
286 : }
287 :
288 : else
289 0 : break;
290 : }
291 : }
292 :
293 0 : pszDestination[nDstCurLen - nDstLen] = '\0';
294 :
295 0 : iconv_close( sConv );
296 :
297 0 : CPLFree( pszIconvSrcBuf );
298 :
299 0 : return pszDestination;
300 : }
301 :
302 : /************************************************************************/
303 : /* CPLRecodeToWCharIconv() */
304 : /************************************************************************/
305 :
306 : /**
307 : * Convert UTF-8 string to a wchar_t string.
308 : *
309 : * Convert a 8bit, multi-byte per character input string into a wide
310 : * character (wchar_t) string using the iconv() function.
311 : *
312 : * Note that the wchar_t type varies in size on different systems. On
313 : * win32 it is normally 2 bytes, and on unix 4 bytes.
314 : *
315 : * If an error occurs an error may, or may not be posted with CPLError().
316 : *
317 : * @param pszSource input multi-byte character string.
318 : * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
319 : * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
320 : *
321 : * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
322 : * NULL on error.
323 : */
324 :
325 0 : wchar_t *CPLRecodeToWCharIconv( const char *pszSource,
326 : const char *pszSrcEncoding,
327 : const char *pszDstEncoding )
328 :
329 : {
330 : return (wchar_t *)CPLRecodeIconv( pszSource,
331 0 : pszSrcEncoding, pszDstEncoding);
332 : }
333 :
334 : #endif /* CPL_RECODE_ICONV */
|