1 : /**********************************************************************
2 : * $Id: cpl_recode_iconv.cpp 23653 2011-12-29 14:27:11Z rouault $
3 : *
4 : * Name: cpl_recode_iconv.cpp
5 : * Project: CPL - Common Portability Library
6 : * Purpose: Character set recoding and char/wchar_t conversions implemented
7 : * using the iconv() functionality.
8 : * Author: Andrey Kiselev, dron@ak4719.spb.edu
9 : *
10 : **********************************************************************
11 : * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
12 : *
13 : * Permission to use, copy, modify, and distribute this software for any
14 : * purpose with or without fee is hereby granted, provided that the above
15 : * copyright notice and this permission notice appear in all copies.
16 : *
17 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
20 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
22 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
23 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24 : **********************************************************************/
25 :
26 : #include "cpl_port.h"
27 :
28 : CPL_CVSID("$Id: cpl_recode_iconv.cpp 23653 2011-12-29 14:27:11Z rouault $");
29 :
30 : #ifdef CPL_RECODE_ICONV
31 :
32 : #include <iconv.h>
33 : #include "cpl_string.h"
34 :
35 : #ifndef ICONV_CPP_CONST
36 : #define ICONV_CPP_CONST ICONV_CONST
37 : #endif
38 :
39 : #define CPL_RECODE_DSTBUF_SIZE 32768
40 :
41 : /************************************************************************/
42 : /* CPLRecodeIconv() */
43 : /************************************************************************/
44 :
45 : /**
46 : * Convert a string from a source encoding to a destination encoding
47 : * using the iconv() function.
48 : *
49 : * If an error occurs an error may, or may not be posted with CPLError().
50 : *
51 : * @param pszSource a NULL terminated string.
52 : * @param pszSrcEncoding the source encoding.
53 : * @param pszDstEncoding the destination encoding.
54 : *
55 : * @return a NULL terminated string which should be freed with CPLFree().
56 : */
57 :
58 644 : char *CPLRecodeIconv( const char *pszSource,
59 : const char *pszSrcEncoding,
60 : const char *pszDstEncoding )
61 :
62 : {
63 : iconv_t sConv;
64 :
65 644 : sConv = iconv_open( pszDstEncoding, pszSrcEncoding );
66 :
67 644 : if ( sConv == (iconv_t)-1 )
68 : {
69 : CPLError( CE_Warning, CPLE_AppDefined,
70 : "Recode from %s to %s failed with the error: \"%s\".",
71 0 : pszSrcEncoding, pszDstEncoding, strerror(errno) );
72 :
73 0 : return CPLStrdup(pszSource);
74 : }
75 :
76 : /* -------------------------------------------------------------------- */
77 : /* XXX: There is a portability issue: iconv() function could be */
78 : /* declared differently on different platforms. The second */
79 : /* argument could be declared as char** (as POSIX defines) or */
80 : /* as a const char**. Handle it with the ICONV_CPP_CONST macro here. */
81 : /* -------------------------------------------------------------------- */
82 644 : ICONV_CPP_CONST char *pszSrcBuf = (ICONV_CPP_CONST char *)pszSource;
83 644 : size_t nSrcLen = strlen( pszSource );
84 644 : size_t nDstCurLen = MAX(CPL_RECODE_DSTBUF_SIZE, nSrcLen + 1);
85 644 : size_t nDstLen = nDstCurLen;
86 644 : char *pszDestination = (char *)CPLCalloc( nDstCurLen, sizeof(char) );
87 644 : char *pszDstBuf = pszDestination;
88 :
89 1932 : while ( nSrcLen > 0 )
90 : {
91 : size_t nConverted =
92 644 : iconv( sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen );
93 :
94 644 : if ( nConverted == (size_t)-1 )
95 : {
96 0 : if ( errno == EILSEQ )
97 : {
98 : // Skip the invalid sequence in the input string.
99 : static int bHasWarned = FALSE;
100 0 : if (!bHasWarned)
101 : {
102 0 : bHasWarned = TRUE;
103 : CPLError(CE_Warning, CPLE_AppDefined,
104 : "One or several characters couldn't be converted correctly from %s to %s.\n"
105 : "This warning will not be emitted anymore",
106 0 : pszSrcEncoding, pszDstEncoding);
107 : }
108 0 : nSrcLen--, pszSrcBuf++;
109 0 : continue;
110 : }
111 :
112 0 : else if ( errno == E2BIG )
113 : {
114 : // We are running out of the output buffer.
115 : // Dynamically increase the buffer size.
116 0 : size_t nTmp = nDstCurLen;
117 0 : nDstCurLen *= 2;
118 : pszDestination =
119 0 : (char *)CPLRealloc( pszDestination, nDstCurLen );
120 0 : pszDstBuf = pszDestination + nTmp - nDstLen;
121 0 : nDstLen += nDstCurLen - nTmp;
122 0 : continue;
123 : }
124 :
125 : else
126 0 : break;
127 : }
128 : }
129 :
130 644 : pszDestination[nDstCurLen - nDstLen] = '\0';
131 :
132 644 : iconv_close( sConv );
133 :
134 644 : return pszDestination;
135 : }
136 :
137 : /************************************************************************/
138 : /* CPLRecodeFromWCharIconv() */
139 : /************************************************************************/
140 :
141 : /**
142 : * Convert wchar_t string to UTF-8.
143 : *
144 : * Convert a wchar_t string into a multibyte utf-8 string
145 : * using the iconv() function.
146 : *
147 : * Note that the wchar_t type varies in size on different systems. On
148 : * win32 it is normally 2 bytes, and on unix 4 bytes.
149 : *
150 : * If an error occurs an error may, or may not be posted with CPLError().
151 : *
152 : * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
153 : * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
154 : * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
155 : *
156 : * @return a zero terminated multi-byte string which should be freed with
157 : * CPLFree(), or NULL if an error occurs.
158 : */
159 :
160 0 : char *CPLRecodeFromWCharIconv( const wchar_t *pwszSource,
161 : const char *pszSrcEncoding,
162 : const char *pszDstEncoding )
163 :
164 : {
165 : /* -------------------------------------------------------------------- */
166 : /* What is the source length. */
167 : /* -------------------------------------------------------------------- */
168 0 : size_t nSrcLen = 0;
169 :
170 0 : while ( pwszSource[nSrcLen] != 0 )
171 0 : nSrcLen++;
172 :
173 : /* -------------------------------------------------------------------- */
174 : /* iconv() does not support wchar_t so we need to repack the */
175 : /* characters according to the width of a character in the */
176 : /* source encoding. For instance if wchar_t is 4 bytes but our */
177 : /* source is UTF16 then we need to pack down into 2 byte */
178 : /* characters before passing to iconv(). */
179 : /* -------------------------------------------------------------------- */
180 0 : int nTargetCharWidth = CPLEncodingCharSize( pszSrcEncoding );
181 :
182 0 : if( nTargetCharWidth < 1 )
183 : {
184 : CPLError( CE_Warning, CPLE_AppDefined,
185 : "Recode from %s with CPLRecodeFromWChar() failed because"
186 : " the width of characters in the encoding are not known.",
187 0 : pszSrcEncoding );
188 0 : return CPLStrdup("");
189 : }
190 :
191 0 : GByte *pszIconvSrcBuf = (GByte*) CPLCalloc((nSrcLen+1),nTargetCharWidth);
192 : unsigned int iSrc;
193 :
194 0 : for( iSrc = 0; iSrc <= nSrcLen; iSrc++ )
195 : {
196 0 : if( nTargetCharWidth == 1 )
197 0 : pszIconvSrcBuf[iSrc] = (GByte) pwszSource[iSrc];
198 0 : else if( nTargetCharWidth == 2 )
199 0 : ((short *)pszIconvSrcBuf)[iSrc] = (short) pwszSource[iSrc];
200 0 : else if( nTargetCharWidth == 4 )
201 0 : ((GInt32 *)pszIconvSrcBuf)[iSrc] = pwszSource[iSrc];
202 : }
203 :
204 : /* -------------------------------------------------------------------- */
205 : /* Create the iconv() translation object. */
206 : /* -------------------------------------------------------------------- */
207 : iconv_t sConv;
208 :
209 0 : sConv = iconv_open( pszDstEncoding, pszSrcEncoding );
210 :
211 0 : if ( sConv == (iconv_t)-1 )
212 : {
213 0 : CPLFree( pszIconvSrcBuf );
214 : CPLError( CE_Warning, CPLE_AppDefined,
215 : "Recode from %s to %s failed with the error: \"%s\".",
216 0 : pszSrcEncoding, pszDstEncoding, strerror(errno) );
217 :
218 0 : return CPLStrdup( "" );
219 : }
220 :
221 : /* -------------------------------------------------------------------- */
222 : /* XXX: There is a portability issue: iconv() function could be */
223 : /* declared differently on different platforms. The second */
224 : /* argument could be declared as char** (as POSIX defines) or */
225 : /* as a const char**. Handle it with the ICONV_CPP_CONST macro here. */
226 : /* -------------------------------------------------------------------- */
227 0 : ICONV_CPP_CONST char *pszSrcBuf = (ICONV_CPP_CONST char *) pszIconvSrcBuf;
228 :
229 : /* iconv expects a number of bytes, not characters */
230 0 : nSrcLen *= sizeof(wchar_t);
231 :
232 : /* -------------------------------------------------------------------- */
233 : /* Allocate destination buffer. */
234 : /* -------------------------------------------------------------------- */
235 0 : size_t nDstCurLen = MAX(CPL_RECODE_DSTBUF_SIZE, nSrcLen + 1);
236 0 : size_t nDstLen = nDstCurLen;
237 0 : char *pszDestination = (char *)CPLCalloc( nDstCurLen, sizeof(char) );
238 0 : char *pszDstBuf = pszDestination;
239 :
240 0 : while ( nSrcLen > 0 )
241 : {
242 : size_t nConverted =
243 0 : iconv( sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen );
244 :
245 0 : if ( nConverted == (size_t)-1 )
246 : {
247 0 : if ( errno == EILSEQ )
248 : {
249 : // Skip the invalid sequence in the input string.
250 0 : nSrcLen--;
251 0 : pszSrcBuf += sizeof(wchar_t);
252 : static int bHasWarned = FALSE;
253 0 : if (!bHasWarned)
254 : {
255 0 : bHasWarned = TRUE;
256 : CPLError(CE_Warning, CPLE_AppDefined,
257 : "One or several characters couldn't be converted correctly from %s to %s.\n"
258 : "This warning will not be emitted anymore",
259 0 : pszSrcEncoding, pszDstEncoding);
260 : }
261 0 : continue;
262 : }
263 :
264 0 : else if ( errno == E2BIG )
265 : {
266 : // We are running out of the output buffer.
267 : // Dynamically increase the buffer size.
268 0 : size_t nTmp = nDstCurLen;
269 0 : nDstCurLen *= 2;
270 : pszDestination =
271 0 : (char *)CPLRealloc( pszDestination, nDstCurLen );
272 0 : pszDstBuf = pszDestination + nTmp - nDstLen;
273 0 : nDstLen += nDstCurLen - nTmp;
274 0 : continue;
275 : }
276 :
277 : else
278 0 : break;
279 : }
280 : }
281 :
282 0 : pszDestination[nDstCurLen - nDstLen] = '\0';
283 :
284 0 : iconv_close( sConv );
285 :
286 0 : CPLFree( pszIconvSrcBuf );
287 :
288 0 : return pszDestination;
289 : }
290 :
291 : /************************************************************************/
292 : /* CPLRecodeToWCharIconv() */
293 : /************************************************************************/
294 :
295 : /**
296 : * Convert UTF-8 string to a wchar_t string.
297 : *
298 : * Convert a 8bit, multi-byte per character input string into a wide
299 : * character (wchar_t) string using the iconv() function.
300 : *
301 : * Note that the wchar_t type varies in size on different systems. On
302 : * win32 it is normally 2 bytes, and on unix 4 bytes.
303 : *
304 : * If an error occurs an error may, or may not be posted with CPLError().
305 : *
306 : * @param pszSource input multi-byte character string.
307 : * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
308 : * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
309 : *
310 : * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
311 : * NULL on error.
312 : */
313 :
314 0 : wchar_t *CPLRecodeToWCharIconv( const char *pszSource,
315 : const char *pszSrcEncoding,
316 : const char *pszDstEncoding )
317 :
318 : {
319 : return (wchar_t *)CPLRecodeIconv( pszSource,
320 0 : pszSrcEncoding, pszDstEncoding);
321 : }
322 :
323 : #endif /* CPL_RECODE_ICONV */
|