LCOV - code coverage report
Current view: directory - port - cpl_recode.cpp (source / functions) Found Hit Coverage
Test: gdal_filtered.info Lines: 42 24 57.1 %
Date: 2012-04-28 Functions: 6 5 83.3 %

       1                 : /**********************************************************************
       2                 :  * $Id: cpl_recode.cpp 22600 2011-06-28 13:36:36Z warmerdam $
       3                 :  *
       4                 :  * Name:     cpl_recode.cpp
       5                 :  * Project:  CPL - Common Portability Library
       6                 :  * Purpose:  Character set recoding and char/wchar_t conversions.
       7                 :  * Author:   Andrey Kiselev, dron@ak4719.spb.edu
       8                 :  *
       9                 :  **********************************************************************
      10                 :  * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
      11                 :  * Copyright (c) 2008, Frank Warmerdam
      12                 :  *
      13                 :  * Permission to use, copy, modify, and distribute this software for any
      14                 :  * purpose with or without fee is hereby granted, provided that the above
      15                 :  * copyright notice and this permission notice appear in all copies.
      16                 :  *
      17                 :  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
      18                 :  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
      19                 :  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
      20                 :  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      21                 :  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
      22                 :  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
      23                 :  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
      24                 :  **********************************************************************/
      25                 : 
      26                 : #include "cpl_string.h"
      27                 : 
      28                 : CPL_CVSID("$Id: cpl_recode.cpp 22600 2011-06-28 13:36:36Z warmerdam $");
      29                 : 
      30                 : #ifdef CPL_RECODE_ICONV
      31                 : extern char *CPLRecodeIconv( const char *, const char *, const char * );
      32                 : extern char *CPLRecodeFromWCharIconv( const wchar_t *,
      33                 :                                       const char *, const char * );
      34                 : extern wchar_t *CPLRecodeToWCharIconv( const char *,
      35                 :                                        const char *, const char * );
      36                 : #endif /* CPL_RECODE_ICONV */
      37                 : 
      38                 : extern char *CPLRecodeStub( const char *, const char *, const char * );
      39                 : extern char *CPLRecodeFromWCharStub( const wchar_t *,
      40                 :                                      const char *, const char * );
      41                 : extern wchar_t *CPLRecodeToWCharStub( const char *,
      42                 :                                       const char *, const char * );
      43                 : extern int CPLIsUTF8Stub( const char *, int );
      44                 : 
      45                 : /************************************************************************/
      46                 : /*                             CPLRecode()                              */
      47                 : /************************************************************************/
      48                 : 
      49                 : /**
      50                 :  * Convert a string from a source encoding to a destination encoding.
      51                 :  *
      52                 :  * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
      53                 :  * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
      54                 :  * <ul>
      55                 :  *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in fact)</li>
      56                 :  *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
      57                 :  *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
      58                 :  * </ul>
      59                 :  *
      60                 :  * If an error occurs, an error may or may not be posted with CPLError().
      61                 :  *
      62                 :  * @param pszSource a NULL terminated string.
      63                 :  * @param pszSrcEncoding the source encoding.
      64                 :  * @param pszDstEncoding the destination encoding.
      65                 :  *
      66                 :  * @return a NULL terminated string which should be freed with CPLFree().
      67                 :  *
      68                 :  * @since GDAL 1.6.0
      69                 :  */
      70                 : 
      71           18630 : char CPL_DLL *CPLRecode( const char *pszSource,
      72                 :                          const char *pszSrcEncoding,
      73                 :                          const char *pszDstEncoding )
      74                 : 
      75                 : {
      76                 : /* -------------------------------------------------------------------- */
      77                 : /*      Handle a few common short cuts.                                 */
      78                 : /* -------------------------------------------------------------------- */
      79           18630 :     if ( EQUAL(pszSrcEncoding, pszDstEncoding) )
      80               0 :         return CPLStrdup(pszSource);
      81                 : 
      82           18630 :     if ( EQUAL(pszSrcEncoding, CPL_ENC_ASCII) 
      83                 :         && ( EQUAL(pszDstEncoding, CPL_ENC_UTF8) 
      84                 :              || EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
      85               0 :         return CPLStrdup(pszSource);
      86                 : 
      87                 : #ifdef CPL_RECODE_ICONV
      88                 : /* -------------------------------------------------------------------- */
      89                 : /*      CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8                               */
      90                 : /*      and CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1 conversions are handled   */
      91                 : /*      very well by the stub implementation which is faster than the   */
      92                 : /*      iconv() route. Use a stub for these two ones and iconv()        */
      93                 : /*      everything else.                                                */
      94                 : /* -------------------------------------------------------------------- */
      95           18630 :     if ( ( EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1)
      96                 :            && EQUAL(pszDstEncoding, CPL_ENC_UTF8) )
      97                 :          || ( EQUAL(pszSrcEncoding, CPL_ENC_UTF8)
      98                 :               && EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
      99                 :     {
     100           17986 :         return CPLRecodeStub( pszSource, pszSrcEncoding, pszDstEncoding );
     101                 :     }
     102                 :     else
     103                 :     {
     104             644 :         return CPLRecodeIconv( pszSource, pszSrcEncoding, pszDstEncoding );
     105                 :     }
     106                 : #else /* CPL_RECODE_STUB */
     107                 :     return CPLRecodeStub( pszSource, pszSrcEncoding, pszDstEncoding );
     108                 : #endif /* CPL_RECODE_ICONV */
     109                 : }
     110                 : 
     111                 : /************************************************************************/
     112                 : /*                         CPLRecodeFromWChar()                         */
     113                 : /************************************************************************/
     114                 : 
     115                 : /**
     116                 :  * Convert wchar_t string to UTF-8. 
     117                 :  *
     118                 :  * Convert a wchar_t string into a multibyte utf-8 string.  The only
     119                 :  * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
     120                 :  * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
     121                 :  * and CPL_ENC_ISO8859_1.  In some cases (i.e. using iconv()) other encodings
     122                 :  * may also be supported.
     123                 :  *
     124                 :  * Note that the wchar_t type varies in size on different systems. On
     125                 :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     126                 :  *
     127                 :  * If an error occurs, an error may or may not be posted with CPLError().
     128                 :  *
     129                 :  * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
     130                 :  * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
     131                 :  * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
     132                 :  *
     133                 :  * @return a zero terminated multi-byte string which should be freed with 
     134                 :  * CPLFree(), or NULL if an error occurs. 
     135                 :  *
     136                 :  * @since GDAL 1.6.0
     137                 :  */
     138                 : 
     139            2248 : char CPL_DLL *CPLRecodeFromWChar( const wchar_t *pwszSource,
     140                 :                                   const char *pszSrcEncoding,
     141                 :                                   const char *pszDstEncoding )
     142                 : 
     143                 : {
     144                 : #ifdef CPL_RECODE_ICONV
     145                 : /* -------------------------------------------------------------------- */
     146                 : /*      Conversions from CPL_ENC_UCS2                                   */
     147                 : /*      to CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well   */
     148                 : /*      handled by the stub implementation.                             */
     149                 : /* -------------------------------------------------------------------- */
     150            2248 :     if ( (EQUAL(pszSrcEncoding, CPL_ENC_UCS2) || EQUAL(pszSrcEncoding, "WCHAR_T"))
     151                 :          && ( EQUAL(pszDstEncoding, CPL_ENC_UTF8)
     152                 :               || EQUAL(pszDstEncoding, CPL_ENC_ASCII)
     153                 :               || EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
     154                 :     {
     155                 :         return CPLRecodeFromWCharStub( pwszSource,
     156            2248 :                                        pszSrcEncoding, pszDstEncoding );
     157                 :     }
     158                 :     else
     159                 :     {
     160                 :         return CPLRecodeFromWCharIconv( pwszSource,
     161               0 :                                         pszSrcEncoding, pszDstEncoding );
     162                 :     }
     163                 : #else /* CPL_RECODE_STUB */
     164                 :     return CPLRecodeFromWCharStub( pwszSource,
     165                 :                                    pszSrcEncoding, pszDstEncoding );
     166                 : #endif /* CPL_RECODE_ICONV */
     167                 : }
     168                 : 
     169                 : /************************************************************************/
     170                 : /*                          CPLRecodeToWChar()                          */
     171                 : /************************************************************************/
     172                 : 
     173                 : /**
     174                 :  * Convert UTF-8 string to a wchar_t string.
     175                 :  *
     176                 :  * Convert a 8bit, multi-byte per character input string into a wide
     177                 :  * character (wchar_t) string.  The only guaranteed supported source encodings
     178                 :  * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1).  The only
     179                 :  * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
     180                 :  * and destination encodings may be supported depending on the underlying
     181                 :  * implementation. 
     182                 :  *
     183                 :  * Note that the wchar_t type varies in size on different systems. On
     184                 :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     185                 :  *
     186                 :  * If an error occurs, an error may or may not be posted with CPLError().
     187                 :  *
     188                 :  * @param pszSource input multi-byte character string.
     189                 :  * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
     190                 :  * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2. 
     191                 :  *
     192                 :  * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
     193                 :  * NULL on error.
     194                 :  *
     195                 :  * @since GDAL 1.6.0
     196                 :  */
     197                 : 
     198            5140 : wchar_t CPL_DLL *CPLRecodeToWChar( const char *pszSource,
     199                 :                                    const char *pszSrcEncoding,
     200                 :                                    const char *pszDstEncoding )
     201                 : 
     202                 : {
     203                 : #ifdef CPL_RECODE_ICONV
     204                 : /* -------------------------------------------------------------------- */
     205                 : /*      Conversions to CPL_ENC_UCS2                                     */
     206                 : /*      from CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
     207                 : /*      handled by the stub implementation.                             */
     208                 : /* -------------------------------------------------------------------- */
     209            5140 :     if ( (EQUAL(pszDstEncoding, CPL_ENC_UCS2) || EQUAL(pszDstEncoding, "WCHAR_T"))
     210                 :          && ( EQUAL(pszSrcEncoding, CPL_ENC_UTF8)
     211                 :               || EQUAL(pszSrcEncoding, CPL_ENC_ASCII)
     212                 :               || EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) ) )
     213                 :     {
     214                 :         return CPLRecodeToWCharStub( pszSource,
     215            5140 :                                      pszSrcEncoding, pszDstEncoding );
     216                 :     }
     217                 :     else
     218                 :     {
     219                 :         return CPLRecodeToWCharIconv( pszSource,
     220               0 :                                       pszSrcEncoding, pszDstEncoding );
     221                 :     }
     222                 : #else /* CPL_RECODE_STUB */
     223                 :     return CPLRecodeToWCharStub( pszSource, pszSrcEncoding, pszDstEncoding );
     224                 : #endif /* CPL_RECODE_ICONV */
     225                 : }
     226                 : 
     227                 : /************************************************************************/
     228                 : /*                                 CPLIsUTF8()                          */
     229                 : /************************************************************************/
     230                 : 
     231                 : /**
     232                 :  * Test if a string is encoded as UTF-8.
     233                 :  *
     234                 :  * @param pabyData input string to test
     235                 :  * @param nLen length of the input string, or -1 if the function must compute
     236                 :  *             the string length. In which case it must be null terminated.
     237                 :  * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
     238                 :  *
     239                 :  * @since GDAL 1.7.0
     240                 :  */
     241          951570 : int CPLIsUTF8(const char* pabyData, int nLen)
     242                 : {
     243          951570 :     return CPLIsUTF8Stub( pabyData, nLen );
     244                 : }
     245                 : 
     246                 : /************************************************************************/
     247                 : /*                          CPLForceToASCII()                           */
     248                 : /************************************************************************/
     249                 : 
     250                 : /**
     251                 :  * Return a new string that is made only of ASCII characters. If non-ASCII
     252                 :  * characters are found in the input string, they will be replaced by the
     253                 :  * provided replacement character.
     254                 :  *
     255                 :  * @param pabyData input string to test
     256                 :  * @param nLen length of the input string, or -1 if the function must compute
     257                 :  *             the string length. In which case it must be null terminated.
     258                 :  * @param chReplacementChar character which will be used when the input stream
     259                 :  *                          contains a non ASCII character. Must be valid ASCII !
     260                 :  *
     261                 :  * @return a new string that must be freed with CPLFree().
     262                 :  *
     263                 :  * @since GDAL 1.7.0
     264                 :  */
     265               2 : char CPL_DLL *CPLForceToASCII(const char* pabyData, int nLen, char chReplacementChar)
     266                 : {
     267               2 :     if (nLen < 0)
     268               2 :         nLen = strlen(pabyData);
     269               2 :     char* pszOutputString = (char*)CPLMalloc(nLen + 1);
     270                 :     int i;
     271              10 :     for(i=0;i<nLen;i++)
     272                 :     {
     273               8 :         if (((unsigned char*)pabyData)[i] > 127)
     274               2 :             pszOutputString[i] = chReplacementChar;
     275                 :         else
     276               6 :             pszOutputString[i] = pabyData[i];
     277                 :     }
     278               2 :     pszOutputString[i] = '\0';
     279               2 :     return pszOutputString;
     280                 : }
     281                 : 
     282                 : /************************************************************************/
     283                 : /*                        CPLEncodingCharSize()                         */
     284                 : /************************************************************************/
     285                 : 
     286                 : /**
     287                 :  * Return bytes per character for encoding.
     288                 :  *
     289                 :  * This function returns the size in bytes of the smallest character
     290                 :  * in this encoding.  For fixed width encodings (ASCII, UCS-2, UCS-4) this
     291                 :  * is straightforward.  For encodings like UTF8 and UTF16 which represent
     292                 :  * some characters as a sequence of atomic character sizes the function
     293                 :  * still returns the atomic character size (1 for UTF8, 2 for UTF16). 
     294                 :  *
     295                 :  * This function will return the correct value for well known encodings
     296                 :  * with corresponding CPL_ENC_ values.  It may not return the correct value
     297                 :  * for other encodings even if they are supported by the underlying iconv 
     298                 :  * or windows transliteration services.  Hopefully it will improve over time.
     299                 :  *
     300                 :  * @param pszEncoding the name of the encoding.
     301                 :  *
     302                 :  * @return the size of a minimal character in bytes or -1 if the size is 
     303                 :  * unknown. 
     304                 :  */
     305                 : 
     306               0 : int CPLEncodingCharSize( const char *pszEncoding )
     307                 : 
     308                 : {
     309               0 :     if( EQUAL(pszEncoding,CPL_ENC_UTF8) )
     310               0 :         return 1;
     311               0 :     else if( EQUAL(pszEncoding,CPL_ENC_UTF16) )
     312               0 :         return 2;
     313               0 :     else if( EQUAL(pszEncoding,CPL_ENC_UCS2) )
     314               0 :         return 2;
     315               0 :     else if( EQUAL(pszEncoding,CPL_ENC_UCS4) )
     316               0 :         return 4;
     317               0 :     else if( EQUAL(pszEncoding,CPL_ENC_ASCII) )
     318               0 :         return 1;
     319               0 :     else if( EQUALN(pszEncoding,"ISO-8859-",9) )
     320               0 :         return 1;
     321                 :     else
     322               0 :         return -1;
     323                 : }
     324                 : 

Generated by: LCOV version 1.7