LCOV - code coverage report
Current view: directory - port - cpl_recode.cpp (source / functions) Found Hit Coverage
Test: gdal_filtered.info Lines: 46 28 60.9 %
Date: 2012-12-26 Functions: 7 6 85.7 %

       1                 : /**********************************************************************
       2                 :  * $Id: cpl_recode.cpp 24555 2012-06-10 09:49:55Z rouault $
       3                 :  *
       4                 :  * Name:     cpl_recode.cpp
       5                 :  * Project:  CPL - Common Portability Library
       6                 :  * Purpose:  Character set recoding and char/wchar_t conversions.
       7                 :  * Author:   Andrey Kiselev, dron@ak4719.spb.edu
       8                 :  *
       9                 :  **********************************************************************
      10                 :  * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
      11                 :  * Copyright (c) 2008, Frank Warmerdam
      12                 :  *
      13                 :  * Permission to use, copy, modify, and distribute this software for any
      14                 :  * purpose with or without fee is hereby granted, provided that the above
      15                 :  * copyright notice and this permission notice appear in all copies.
      16                 :  *
      17                 :  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
      18                 :  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
      19                 :  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
      20                 :  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      21                 :  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
      22                 :  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
      23                 :  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
      24                 :  **********************************************************************/
      25                 : 
      26                 : #include "cpl_string.h"
      27                 : 
      28                 : CPL_CVSID("$Id: cpl_recode.cpp 24555 2012-06-10 09:49:55Z rouault $");
      29                 : 
      30                 : #ifdef CPL_RECODE_ICONV
      31                 : extern void CPLClearRecodeIconvWarningFlags();
      32                 : extern char *CPLRecodeIconv( const char *, const char *, const char * );
      33                 : extern char *CPLRecodeFromWCharIconv( const wchar_t *,
      34                 :                                       const char *, const char * );
      35                 : extern wchar_t *CPLRecodeToWCharIconv( const char *,
      36                 :                                        const char *, const char * );
      37                 : #endif /* CPL_RECODE_ICONV */
      38                 : 
      39                 : extern void CPLClearRecodeStubWarningFlags();
      40                 : extern char *CPLRecodeStub( const char *, const char *, const char * );
      41                 : extern char *CPLRecodeFromWCharStub( const wchar_t *,
      42                 :                                      const char *, const char * );
      43                 : extern wchar_t *CPLRecodeToWCharStub( const char *,
      44                 :                                       const char *, const char * );
      45                 : extern int CPLIsUTF8Stub( const char *, int );
      46                 : 
      47                 : /************************************************************************/
      48                 : /*                             CPLRecode()                              */
      49                 : /************************************************************************/
      50                 : 
      51                 : /**
      52                 :  * Convert a string from a source encoding to a destination encoding.
      53                 :  *
      54                 :  * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
      55                 :  * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
      56                 :  * <ul>
      57                 :  *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in fact)</li>
      58                 :  *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
      59                 :  *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
      60                 :  * </ul>
      61                 :  *
      62                 :  * If an error occurs an error may, or may not be posted with CPLError(). 
      63                 :  *
      64                 :  * @param pszSource a NULL terminated string.
      65                 :  * @param pszSrcEncoding the source encoding.
      66                 :  * @param pszDstEncoding the destination encoding.
      67                 :  *
      68                 :  * @return a NULL terminated string which should be freed with CPLFree().
      69                 :  *
      70                 :  * @since GDAL 1.6.0
      71                 :  */
      72                 : 
      73           55377 : char CPL_DLL *CPLRecode( const char *pszSource,
      74                 :                          const char *pszSrcEncoding,
      75                 :                          const char *pszDstEncoding )
      76                 : 
      77                 : {
      78                 : /* -------------------------------------------------------------------- */
      79                 : /*      Handle a few common short cuts.                                 */
      80                 : /* -------------------------------------------------------------------- */
      81           55377 :     if ( EQUAL(pszSrcEncoding, pszDstEncoding) )
      82               0 :         return CPLStrdup(pszSource);
      83                 : 
      84           55377 :     if ( EQUAL(pszSrcEncoding, CPL_ENC_ASCII) 
      85                 :         && ( EQUAL(pszDstEncoding, CPL_ENC_UTF8) 
      86                 :              || EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
      87               0 :         return CPLStrdup(pszSource);
      88                 : 
      89                 : #ifdef CPL_RECODE_ICONV
      90                 : /* -------------------------------------------------------------------- */
      91                 : /*      CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8                               */
      92                 : /*      and CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1 conversions are handled   */
      93                 : /*      very well by the stub implementation which is faster than the   */
      94                 : /*      iconv() route. Use the stub for these two and iconv() for       */
      95                 : /*      everything else.                                                */
      96                 : /* -------------------------------------------------------------------- */
      97           55377 :     if ( ( EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1)
      98                 :            && EQUAL(pszDstEncoding, CPL_ENC_UTF8) )
      99                 :          || ( EQUAL(pszSrcEncoding, CPL_ENC_UTF8)
     100                 :               && EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
     101                 :     {
     102           55342 :         return CPLRecodeStub( pszSource, pszSrcEncoding, pszDstEncoding );
     103                 :     }
     104                 :     else
     105                 :     {
     106              35 :         return CPLRecodeIconv( pszSource, pszSrcEncoding, pszDstEncoding );
     107                 :     }
     108                 : #else /* CPL_RECODE_STUB */
     109                 :     return CPLRecodeStub( pszSource, pszSrcEncoding, pszDstEncoding );
     110                 : #endif /* CPL_RECODE_ICONV */
     111                 : }
     112                 : 
     113                 : /************************************************************************/
     114                 : /*                         CPLRecodeFromWChar()                         */
     115                 : /************************************************************************/
     116                 : 
     117                 : /**
     118                 :  * Convert wchar_t string to UTF-8. 
     119                 :  *
     120                 :  * Convert a wchar_t string into a multibyte utf-8 string.  The only
     121                 :  * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
     122                 :  * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
     123                 :  * and CPL_ENC_ISO8859_1.  In some cases (i.e. using iconv()) other encodings
     124                 :  * may also be supported.
     125                 :  *
     126                 :  * Note that the wchar_t type varies in size on different systems. On
     127                 :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     128                 :  *
     129                 :  * If an error occurs an error may, or may not be posted with CPLError(). 
     130                 :  *
     131                 :  * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
     132                 :  * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
     133                 :  * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
     134                 :  *
     135                 :  * @return a zero terminated multi-byte string which should be freed with 
     136                 :  * CPLFree(), or NULL if an error occurs. 
     137                 :  *
     138                 :  * @since GDAL 1.6.0
     139                 :  */
     140                 : 
     141           12605 : char CPL_DLL *CPLRecodeFromWChar( const wchar_t *pwszSource,
     142                 :                                   const char *pszSrcEncoding,
     143                 :                                   const char *pszDstEncoding )
     144                 : 
     145                 : {
     146                 : #ifdef CPL_RECODE_ICONV
     147                 : /* -------------------------------------------------------------------- */
     148                 : /*      Conversions from CPL_ENC_UCS2                                   */
     149                 : /*      to CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well   */
     150                 : /*      handled by the stub implementation.                             */
     151                 : /* -------------------------------------------------------------------- */
     152           12605 :     if ( (EQUAL(pszSrcEncoding, CPL_ENC_UCS2) || EQUAL(pszSrcEncoding, "WCHAR_T"))
     153                 :          && ( EQUAL(pszDstEncoding, CPL_ENC_UTF8)
     154                 :               || EQUAL(pszDstEncoding, CPL_ENC_ASCII)
     155                 :               || EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
     156                 :     {
     157                 :         return CPLRecodeFromWCharStub( pwszSource,
     158           12605 :                                        pszSrcEncoding, pszDstEncoding );
     159                 :     }
     160                 :     else
     161                 :     {
     162                 :         return CPLRecodeFromWCharIconv( pwszSource,
     163               0 :                                         pszSrcEncoding, pszDstEncoding );
     164                 :     }
     165                 : #else /* CPL_RECODE_STUB */
     166                 :     return CPLRecodeFromWCharStub( pwszSource,
     167                 :                                    pszSrcEncoding, pszDstEncoding );
     168                 : #endif /* CPL_RECODE_ICONV */
     169                 : }
     170                 : 
     171                 : /************************************************************************/
     172                 : /*                          CPLRecodeToWChar()                          */
     173                 : /************************************************************************/
     174                 : 
     175                 : /**
     176                 :  * Convert UTF-8 string to a wchar_t string.
     177                 :  *
     178                 :  * Convert an 8-bit, multi-byte per character input string into a wide
     179                 :  * character (wchar_t) string.  The only guaranteed supported source encodings
     180                 :  * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1).  The only
     181                 :  * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
     182                 :  * and destination encodings may be supported depending on the underlying
     183                 :  * implementation. 
     184                 :  *
     185                 :  * Note that the wchar_t type varies in size on different systems. On
     186                 :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     187                 :  *
     188                 :  * If an error occurs an error may, or may not be posted with CPLError(). 
     189                 :  *
     190                 :  * @param pszSource input multi-byte character string.
     191                 :  * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
     192                 :  * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2. 
     193                 :  *
     194                 :  * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
     195                 :  * NULL on error.
     196                 :  *
     197                 :  * @since GDAL 1.6.0
     198                 :  */
     199                 : 
     200            2933 : wchar_t CPL_DLL *CPLRecodeToWChar( const char *pszSource,
     201                 :                                    const char *pszSrcEncoding,
     202                 :                                    const char *pszDstEncoding )
     203                 : 
     204                 : {
     205                 : #ifdef CPL_RECODE_ICONV
     206                 : /* -------------------------------------------------------------------- */
     207                 : /*      Conversions to CPL_ENC_UCS2                                     */
     208                 : /*      from CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
     209                 : /*      handled by the stub implementation.                             */
     210                 : /* -------------------------------------------------------------------- */
     211            2933 :     if ( (EQUAL(pszDstEncoding, CPL_ENC_UCS2) || EQUAL(pszDstEncoding, "WCHAR_T"))
     212                 :          && ( EQUAL(pszSrcEncoding, CPL_ENC_UTF8)
     213                 :               || EQUAL(pszSrcEncoding, CPL_ENC_ASCII)
     214                 :               || EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) ) )
     215                 :     {
     216                 :         return CPLRecodeToWCharStub( pszSource,
     217            2933 :                                      pszSrcEncoding, pszDstEncoding );
     218                 :     }
     219                 :     else
     220                 :     {
     221                 :         return CPLRecodeToWCharIconv( pszSource,
     222               0 :                                       pszSrcEncoding, pszDstEncoding );
     223                 :     }
     224                 : #else /* CPL_RECODE_STUB */
     225                 :     return CPLRecodeToWCharStub( pszSource, pszSrcEncoding, pszDstEncoding );
     226                 : #endif /* CPL_RECODE_ICONV */
     227                 : }
     228                 : 
     229                 : /************************************************************************/
     230                 : /*                                 CPLIsUTF8()                          */
     231                 : /************************************************************************/
     232                 : 
     233                 : /**
     234                 :  * Test if a string is encoded as UTF-8.
     235                 :  *
     236                 :  * @param pabyData input string to test
     237                 :  * @param nLen length of the input string, or -1 if the function must compute
     238                 :  *             the string length. In which case it must be null terminated.
     239                 :  * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
     240                 :  *
     241                 :  * @since GDAL 1.7.0
     242                 :  */
     243          476054 : int CPLIsUTF8(const char* pabyData, int nLen)
     244                 : {
     245          476054 :     return CPLIsUTF8Stub( pabyData, nLen );
     246                 : }
     247                 : 
     248                 : /************************************************************************/
     249                 : /*                          CPLForceToASCII()                           */
     250                 : /************************************************************************/
     251                 : 
     252                 : /**
     253                 :  * Return a new string that is made only of ASCII characters. If non-ASCII
     254                 :  * characters are found in the input string, they will be replaced by the
     255                 :  * provided replacement character.
     256                 :  *
     257                 :  * @param pabyData input string to convert
     258                 :  * @param nLen length of the input string, or -1 if the function must compute
     259                 :  *             the string length. In which case it must be null terminated.
     260                 :  * @param chReplacementChar character which will be used when the input stream
     261                 :  *                          contains a non ASCII character. Must be valid ASCII !
     262                 :  *
     263                 :  * @return a new string that must be freed with CPLFree().
     264                 :  *
     265                 :  * @since GDAL 1.7.0
     266                 :  */
     267               1 : char CPL_DLL *CPLForceToASCII(const char* pabyData, int nLen, char chReplacementChar)
     268                 : {
     269               1 :     if (nLen < 0)
     270               1 :         nLen = strlen(pabyData);
     271               1 :     char* pszOutputString = (char*)CPLMalloc(nLen + 1);
     272                 :     int i;
     273               5 :     for(i=0;i<nLen;i++)
     274                 :     {
     275               4 :         if (((unsigned char*)pabyData)[i] > 127)
     276               1 :             pszOutputString[i] = chReplacementChar;
     277                 :         else
     278               3 :             pszOutputString[i] = pabyData[i];
     279                 :     }
     280               1 :     pszOutputString[i] = '\0';
     281               1 :     return pszOutputString;
     282                 : }
     283                 : 
     284                 : /************************************************************************/
     285                 : /*                        CPLEncodingCharSize()                         */
     286                 : /************************************************************************/
     287                 : 
     288                 : /**
     289                 :  * Return bytes per character for encoding.
     290                 :  *
     291                 :  * This function returns the size in bytes of the smallest character
     292                 :  * in this encoding.  For fixed width encodings (ASCII, UCS-2, UCS-4) this
     293                 :  * is straightforward.  For encodings like UTF8 and UTF16 which represent
     294                 :  * some characters as a sequence of atomic character sizes the function
     295                 :  * still returns the atomic character size (1 for UTF8, 2 for UTF16). 
     296                 :  *
     297                 :  * This function will return the correct value for well known encodings
     298                 :  * with corresponding CPL_ENC_ values.  It may not return the correct value
     299                 :  * for other encodings even if they are supported by the underlying iconv 
     300                 :  * or windows transliteration services.  Hopefully it will improve over time.
     301                 :  *
     302                 :  * @param pszEncoding the name of the encoding.
     303                 :  *
     304                 :  * @return the size of a minimal character in bytes or -1 if the size is 
     305                 :  * unknown. 
     306                 :  */
     307                 : 
     308               0 : int CPLEncodingCharSize( const char *pszEncoding )
     309                 : 
     310                 : {
     311               0 :     if( EQUAL(pszEncoding,CPL_ENC_UTF8) )
     312               0 :         return 1;
     313               0 :     else if( EQUAL(pszEncoding,CPL_ENC_UTF16) )
     314               0 :         return 2;
     315               0 :     else if( EQUAL(pszEncoding,CPL_ENC_UCS2) )
     316               0 :         return 2;
     317               0 :     else if( EQUAL(pszEncoding,CPL_ENC_UCS4) )
     318               0 :         return 4;
     319               0 :     else if( EQUAL(pszEncoding,CPL_ENC_ASCII) )
     320               0 :         return 1;
     321               0 :     else if( EQUALN(pszEncoding,"ISO-8859-",9) )
     322               0 :         return 1;
     323                 :     else
     324               0 :         return -1;
     325                 : }
     326                 : 
     327                 : /************************************************************************/
     328                 : /*                    CPLClearRecodeWarningFlags()                      */
     329                 : /************************************************************************/
     330                 : 
     331            6786 : void CPLClearRecodeWarningFlags()
     332                 : {
     333                 : #ifdef CPL_RECODE_ICONV
     334            6786 :     CPLClearRecodeIconvWarningFlags();
     335                 : #endif
     336            6786 :     CPLClearRecodeStubWarningFlags();
     337            6786 : }

Generated by: LCOV version 1.7