1 : /**********************************************************************
2 : * $Id: cpl_recode_iconv.cpp 22600 2011-06-28 13:36:36Z warmerdam $
3 : *
4 : * Name: cpl_recode_iconv.cpp
5 : * Project: CPL - Common Portability Library
6 : * Purpose: Character set recoding and char/wchar_t conversions implemented
7 : * using the iconv() functionality.
8 : * Author: Andrey Kiselev, dron@ak4719.spb.edu
9 : *
10 : **********************************************************************
11 : * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
12 : *
13 : * Permission to use, copy, modify, and distribute this software for any
14 : * purpose with or without fee is hereby granted, provided that the above
15 : * copyright notice and this permission notice appear in all copies.
16 : *
17 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
20 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
22 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
23 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24 : **********************************************************************/
25 :
26 : #include "cpl_port.h"
27 :
28 : CPL_CVSID("$Id: cpl_recode_iconv.cpp 22600 2011-06-28 13:36:36Z warmerdam $");
29 :
30 : #ifdef CPL_RECODE_ICONV
31 :
32 : #include <iconv.h>
33 : #include "cpl_string.h"
34 :
35 : #define CPL_RECODE_DSTBUF_SIZE 32768
36 :
37 : /************************************************************************/
38 : /* CPLRecodeIconv() */
39 : /************************************************************************/
40 :
41 : /**
42 : * Convert a string from a source encoding to a destination encoding
43 : * using the iconv() function.
44 : *
45 : * If an error occurs an error may, or may not be posted with CPLError().
46 : *
47 : * @param pszSource a NULL terminated string.
48 : * @param pszSrcEncoding the source encoding.
49 : * @param pszDstEncoding the destination encoding.
50 : *
51 : * @return a NULL terminated string which should be freed with CPLFree().
52 : */
53 :
54 2 : char *CPLRecodeIconv( const char *pszSource,
55 : const char *pszSrcEncoding,
56 : const char *pszDstEncoding )
57 :
58 : {
59 : iconv_t sConv;
60 :
61 2 : sConv = iconv_open( pszDstEncoding, pszSrcEncoding );
62 :
63 2 : if ( sConv == (iconv_t)-1 )
64 : {
65 : CPLError( CE_Warning, CPLE_AppDefined,
66 : "Recode from %s to %s failed with the error: \"%s\".",
67 0 : pszSrcEncoding, pszDstEncoding, strerror(errno) );
68 :
69 0 : return CPLStrdup(pszSource);
70 : }
71 :
72 : /* -------------------------------------------------------------------- */
73 : /* XXX: There is a portability issue: iconv() function could be */
74 : /* declared differently on different platforms. The second */
75 : /* argument could be declared as char** (as POSIX defines) or */
76 : /* as a const char**. Handle it with the ICONV_CONST macro here. */
77 : /* -------------------------------------------------------------------- */
78 2 : ICONV_CONST char *pszSrcBuf = (ICONV_CONST char *)pszSource;
79 2 : size_t nSrcLen = strlen( pszSource );
80 2 : size_t nDstCurLen = MAX(CPL_RECODE_DSTBUF_SIZE, nSrcLen + 1);
81 2 : size_t nDstLen = nDstCurLen;
82 2 : char *pszDestination = (char *)CPLCalloc( nDstCurLen, sizeof(char) );
83 2 : char *pszDstBuf = pszDestination;
84 :
85 6 : while ( nSrcLen > 0 )
86 : {
87 : size_t nConverted =
88 2 : iconv( sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen );
89 :
90 2 : if ( nConverted == (size_t)-1 )
91 : {
92 0 : if ( errno == EILSEQ )
93 : {
94 : // Skip the invalid sequence in the input string.
95 : static int bHasWarned = FALSE;
96 0 : if (!bHasWarned)
97 : {
98 0 : bHasWarned = TRUE;
99 : CPLError(CE_Warning, CPLE_AppDefined,
100 : "One or several characters couldn't be converted correctly from %s to %s.\n"
101 : "This warning will not be emitted anymore",
102 0 : pszSrcEncoding, pszDstEncoding);
103 : }
104 0 : nSrcLen--, pszSrcBuf++;
105 0 : continue;
106 : }
107 :
108 0 : else if ( errno == E2BIG )
109 : {
110 : // We are running out of the output buffer.
111 : // Dynamically increase the buffer size.
112 0 : size_t nTmp = nDstCurLen;
113 0 : nDstCurLen *= 2;
114 : pszDestination =
115 0 : (char *)CPLRealloc( pszDestination, nDstCurLen );
116 0 : pszDstBuf = pszDestination + nTmp - nDstLen;
117 0 : nDstLen += nDstCurLen - nTmp;
118 0 : continue;
119 : }
120 :
121 : else
122 0 : break;
123 : }
124 : }
125 :
126 2 : pszDestination[nDstCurLen - nDstLen] = '\0';
127 :
128 2 : iconv_close( sConv );
129 :
130 2 : return pszDestination;
131 : }
132 :
133 : /************************************************************************/
134 : /* CPLRecodeFromWCharIconv() */
135 : /************************************************************************/
136 :
137 : /**
138 : * Convert wchar_t string to UTF-8.
139 : *
140 : * Convert a wchar_t string into a multibyte utf-8 string
141 : * using the iconv() function.
142 : *
143 : * Note that the wchar_t type varies in size on different systems. On
144 : * win32 it is normally 2 bytes, and on unix 4 bytes.
145 : *
146 : * If an error occurs an error may, or may not be posted with CPLError().
147 : *
148 : * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
149 : * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
150 : * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
151 : *
152 : * @return a zero terminated multi-byte string which should be freed with
153 : * CPLFree(), or NULL if an error occurs.
154 : */
155 :
156 0 : char *CPLRecodeFromWCharIconv( const wchar_t *pwszSource,
157 : const char *pszSrcEncoding,
158 : const char *pszDstEncoding )
159 :
160 : {
161 : /* -------------------------------------------------------------------- */
162 : /* What is the source length. */
163 : /* -------------------------------------------------------------------- */
164 0 : size_t nSrcLen = 0;
165 :
166 0 : while ( pwszSource[nSrcLen] != 0 )
167 0 : nSrcLen++;
168 :
169 : /* -------------------------------------------------------------------- */
170 : /* iconv() does not support wchar_t so we need to repack the */
171 : /* characters according to the width of a character in the */
172 : /* source encoding. For instance if wchar_t is 4 bytes but our */
173 : /* source is UTF16 then we need to pack down into 2 byte */
174 : /* characters before passing to iconv(). */
175 : /* -------------------------------------------------------------------- */
176 0 : int nTargetCharWidth = CPLEncodingCharSize( pszSrcEncoding );
177 :
178 0 : if( nTargetCharWidth < 1 )
179 : {
180 : CPLError( CE_Warning, CPLE_AppDefined,
181 : "Recode from %s with CPLRecodeFromWChar() failed because"
182 : " the width of characters in the encoding are not known.",
183 0 : pszSrcEncoding );
184 0 : return CPLStrdup("");
185 : }
186 :
187 0 : GByte *pszIconvSrcBuf = (GByte*) CPLCalloc((nSrcLen+1),nTargetCharWidth);
188 : unsigned int iSrc;
189 :
190 0 : for( iSrc = 0; iSrc <= nSrcLen; iSrc++ )
191 : {
192 0 : if( nTargetCharWidth == 1 )
193 0 : pszIconvSrcBuf[iSrc] = (GByte) pwszSource[iSrc];
194 0 : else if( nTargetCharWidth == 2 )
195 0 : ((short *)pszIconvSrcBuf)[iSrc] = (short) pwszSource[iSrc];
196 0 : else if( nTargetCharWidth == 4 )
197 0 : ((GInt32 *)pszIconvSrcBuf)[iSrc] = pwszSource[iSrc];
198 : }
199 :
200 : /* -------------------------------------------------------------------- */
201 : /* Create the iconv() translation object. */
202 : /* -------------------------------------------------------------------- */
203 : iconv_t sConv;
204 :
205 0 : sConv = iconv_open( pszDstEncoding, pszSrcEncoding );
206 :
207 0 : if ( sConv == (iconv_t)-1 )
208 : {
209 0 : CPLFree( pszIconvSrcBuf );
210 : CPLError( CE_Warning, CPLE_AppDefined,
211 : "Recode from %s to %s failed with the error: \"%s\".",
212 0 : pszSrcEncoding, pszDstEncoding, strerror(errno) );
213 :
214 0 : return CPLStrdup( "" );
215 : }
216 :
217 : /* -------------------------------------------------------------------- */
218 : /* XXX: There is a portability issue: iconv() function could be */
219 : /* declared differently on different platforms. The second */
220 : /* argument could be declared as char** (as POSIX defines) or */
221 : /* as a const char**. Handle it with the ICONV_CONST macro here. */
222 : /* -------------------------------------------------------------------- */
223 0 : ICONV_CONST char *pszSrcBuf = (ICONV_CONST char *) pszIconvSrcBuf;
224 :
225 : /* iconv expects a number of bytes, not characters */
226 0 : nSrcLen *= sizeof(wchar_t);
227 :
228 : /* -------------------------------------------------------------------- */
229 : /* Allocate destination buffer. */
230 : /* -------------------------------------------------------------------- */
231 0 : size_t nDstCurLen = MAX(CPL_RECODE_DSTBUF_SIZE, nSrcLen + 1);
232 0 : size_t nDstLen = nDstCurLen;
233 0 : char *pszDestination = (char *)CPLCalloc( nDstCurLen, sizeof(char) );
234 0 : char *pszDstBuf = pszDestination;
235 :
236 0 : while ( nSrcLen > 0 )
237 : {
238 : size_t nConverted =
239 0 : iconv( sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen );
240 :
241 0 : if ( nConverted == (size_t)-1 )
242 : {
243 0 : if ( errno == EILSEQ )
244 : {
245 : // Skip the invalid sequence in the input string.
246 0 : nSrcLen--;
247 0 : pszSrcBuf += sizeof(wchar_t);
248 : static int bHasWarned = FALSE;
249 0 : if (!bHasWarned)
250 : {
251 0 : bHasWarned = TRUE;
252 : CPLError(CE_Warning, CPLE_AppDefined,
253 : "One or several characters couldn't be converted correctly from %s to %s.\n"
254 : "This warning will not be emitted anymore",
255 0 : pszSrcEncoding, pszDstEncoding);
256 : }
257 0 : continue;
258 : }
259 :
260 0 : else if ( errno == E2BIG )
261 : {
262 : // We are running out of the output buffer.
263 : // Dynamically increase the buffer size.
264 0 : size_t nTmp = nDstCurLen;
265 0 : nDstCurLen *= 2;
266 : pszDestination =
267 0 : (char *)CPLRealloc( pszDestination, nDstCurLen );
268 0 : pszDstBuf = pszDestination + nTmp - nDstLen;
269 0 : nDstLen += nDstCurLen - nTmp;
270 0 : continue;
271 : }
272 :
273 : else
274 0 : break;
275 : }
276 : }
277 :
278 0 : pszDestination[nDstCurLen - nDstLen] = '\0';
279 :
280 0 : iconv_close( sConv );
281 :
282 0 : CPLFree( pszIconvSrcBuf );
283 :
284 0 : return pszDestination;
285 : }
286 :
287 : /************************************************************************/
288 : /* CPLRecodeToWCharIconv() */
289 : /************************************************************************/
290 :
291 : /**
292 : * Convert UTF-8 string to a wchar_t string.
293 : *
294 : * Convert a 8bit, multi-byte per character input string into a wide
295 : * character (wchar_t) string using the iconv() function.
296 : *
297 : * Note that the wchar_t type varies in size on different systems. On
298 : * win32 it is normally 2 bytes, and on unix 4 bytes.
299 : *
300 : * If an error occurs an error may, or may not be posted with CPLError().
301 : *
302 : * @param pszSource input multi-byte character string.
303 : * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
304 : * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
305 : *
306 : * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
307 : * NULL on error.
308 : */
309 :
310 0 : wchar_t *CPLRecodeToWCharIconv( const char *pszSource,
311 : const char *pszSrcEncoding,
312 : const char *pszDstEncoding )
313 :
314 : {
315 : return (wchar_t *)CPLRecodeIconv( pszSource,
316 0 : pszSrcEncoding, pszDstEncoding);
317 : }
318 :
319 : #endif /* CPL_RECODE_ICONV */
|