LCOV - gdal_filtered.info - port/cpl_recode

LCOV - code coverage report
Current view:	directory - port - cpl_recode_stub.cpp (source / functions)		Found	Hit	Coverage
Test:	gdal_filtered.info	Lines:	231	133	57.6 %
Date:	2012-12-26	Functions:	11	11	100.0 %
       1                 : /**********************************************************************
       2                 :  * $Id: cpl_recode_stub.cpp 24557 2012-06-10 10:22:49Z rouault $
       3                 :  *
       4                 :  * Name:     cpl_recode_stub.cpp
       5                 :  * Project:  CPL - Common Portability Library
       6                 :  * Purpose:  Character set recoding and char/wchar_t conversions, stub
       7                 :  *           implementation to be used if iconv() functionality is not
       8                 :  *           available.
       9                 :  * Author:   Frank Warmerdam, warmerdam@pobox.com
      10                 :  *
      11                 :  * The bulk of this code is derived from the utf.c module from FLTK. It
      12                 :  * was originally downloaded from:
      13                 :  *    http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
      14                 :  * 
      15                 :  **********************************************************************
      16                 :  * Copyright (c) 2008, Frank Warmerdam
      17                 :  * Copyright 2006 by Bill Spitzak and others.
      18                 :  *
      19                 :  * Permission to use, copy, modify, and distribute this software for any
      20                 :  * purpose with or without fee is hereby granted, provided that the above
      21                 :  * copyright notice and this permission notice appear in all copies.
      22                 :  *
      23                 :  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
      24                 :  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
      25                 :  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
      26                 :  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      27                 :  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
      28                 :  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
      29                 :  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
      30                 :  **********************************************************************/
      31                 : 
      32                 : #include "cpl_string.h"
      33                 : 
      34                 : CPL_CVSID("$Id: cpl_recode_stub.cpp 24557 2012-06-10 10:22:49Z rouault $");
      35                 : 
      36                 : #ifdef CPL_RECODE_STUB 
      37                 : 
      38                 : static unsigned utf8decode(const char* p, const char* end, int* len); // decodes one UTF-8 sequence; described in the utf8decode comment further down the file
      39                 : static unsigned utf8towc(const char* src, unsigned srclen, // UTF-8 to wchar_t; called by CPLRecodeToWCharStub
      40                 :                          wchar_t* dst, unsigned dstlen);
      41                 : static unsigned utf8toa(const char* src, unsigned srclen, // UTF-8 to 8-bit chars; called by CPLRecodeStub for UTF-8 to ISO8859-1
      42                 :                         char* dst, unsigned dstlen);
      43                 : static unsigned utf8fromwc(char* dst, unsigned dstlen, // wchar_t to UTF-8; called by CPLRecodeFromWCharStub
      44                 :                            const wchar_t* src, unsigned srclen);
      45                 : static unsigned utf8froma(char* dst, unsigned dstlen, // 8-bit chars to UTF-8; called by CPLRecodeStub
      46                 :                           const char* src, unsigned srclen);
      47                 : static int utf8test(const char* src, unsigned srclen); // result is tested against zero by CPLIsUTF8Stub
      48                 : 
      49                 : #ifdef _WIN32 /* Windows-only code-page helper */
      50                 : 
      51                 : #include <windows.h>
      52                 : #include <winnls.h>
      53                 : 
      54                 : static char* CPLWin32Recode( const char* src, // used by CPLRecodeStub for the CPxxx to UTF-8 and UTF-8 to CPxxx cases
      55                 :                              unsigned src_code_page, unsigned dst_code_page );
      56                 : #endif
      57                 : 
      58                 : #ifdef FUTURE_NEEDS /* these helpers are only declared when FUTURE_NEEDS is defined; no caller is visible in this part of the file */
      59                 : static const char* utf8fwd(const char* p, const char* start, const char* end);
      60                 : static const char* utf8back(const char* p, const char* start, const char*end);
      61                 : static int utf8encode(unsigned ucs, char* buf);
      62                 : static int utf8bytes(unsigned ucs);
      63                 : #endif /* def FUTURE_NEEDS */
      64                 : 
      65                 : /************************************************************************/
      66                 : /* ==================================================================== */
      67                 : /*  Stub Implementation not depending on iconv() or WIN32 API.  */
      68                 : /* ==================================================================== */
      69                 : /************************************************************************/
      70                 : 
      71                 : static int bHaveWarned1 = FALSE; // set once CPLRecodeStub has warned about an unsupported source encoding being treated as ISO8859-1 to UTF-8
      72                 : static int bHaveWarned2 = FALSE; // set once CPLRecodeStub has warned about an unsupported destination being treated as UTF-8 to ISO8859-1
      73                 : static int bHaveWarned3 = FALSE; // set once CPLRecodeStub has warned that an unsupported pair was left unchanged
      74                 : static int bHaveWarned4 = FALSE; // not used in the visible part of the file; reset by CPLClearRecodeStubWarningFlags
      75                 : static int bHaveWarned5 = FALSE; // not used in the visible part of the file; reset by CPLClearRecodeStubWarningFlags
      76                 : static int bHaveWarned6 = FALSE; // not used in the visible part of the file; reset by CPLClearRecodeStubWarningFlags
      77                 : 
      78                 : /************************************************************************/
      79                 : /*                 CPLClearRecodeStubWarningFlags()                     */
      80                 : /************************************************************************/
      81                 : 
      82            6786 : void CPLClearRecodeStubWarningFlags() // resets all six one-time warning flags to FALSE
      83                 : {
      84            6786 :     bHaveWarned1 = FALSE; // flags 1 to 3 gate the CPLError warnings in CPLRecodeStub, so those warnings can be emitted again after this call
      85            6786 :     bHaveWarned2 = FALSE;
      86            6786 :     bHaveWarned3 = FALSE;
      87            6786 :     bHaveWarned4 = FALSE; // flags 4 to 6 are presumably used by code outside this view
      88            6786 :     bHaveWarned5 = FALSE;
      89            6786 :     bHaveWarned6 = FALSE;
      90            6786 : }
      91                 : 
      92                 : /************************************************************************/
      93                 : /*                           CPLRecodeStub()                            */
      94                 : /************************************************************************/
      95                 : 
      96                 : /**
      97                 :  * Convert a string from a source encoding to a destination encoding.
      98                 :  *
      99                 :  * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
     100                 :  * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
     101                 :  * <ul>
     102                 :  *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in fact)</li>
     103                 :  *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
     104                 :  *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
     105                 :  * </ul>
     106                 :  *
     107                 :  * If an error occurs, an error may or may not be posted with CPLError().
     108                 :  *
     109                 :  * @param pszSource a NULL terminated string.
     110                 :  * @param pszSrcEncoding the source encoding.
     111                 :  * @param pszDstEncoding the destination encoding.
     112                 :  *
     113                 :  * @return a NULL terminated string which should be freed with CPLFree().
     114                 :  */
     115                 : 
     116           55342 : char *CPLRecodeStub( const char *pszSource, 
     117                 :                      const char *pszSrcEncoding, 
     118                 :                      const char *pszDstEncoding )
     119                 : 
     120                 : { // Picks a conversion from the (pszSrcEncoding, pszDstEncoding) pair; the non-Win32 paths return a buffer obtained from CPLCalloc or CPLStrdup.
     121                 : /* -------------------------------------------------------------------- */
     122                 : /*      If the source or destination is current locale(), we change     */
     123                 : /*      it to ISO8859-1 since our stub implementation does not          */
     124                 : /*      attempt to address locales properly.                            */
     125                 : /* -------------------------------------------------------------------- */
     126                 : 
     127           55342 :     if( pszSrcEncoding[0] == '\0' ) // NOTE(review): both encoding pointers are dereferenced without a NULL check
     128               0 :         pszSrcEncoding = CPL_ENC_ISO8859_1;
     129                 : 
     130           55342 :     if( pszDstEncoding[0] == '\0' )
     131               0 :         pszDstEncoding = CPL_ENC_ISO8859_1;
     132                 : 
     133                 : /* -------------------------------------------------------------------- */
     134                 : /*      ISO8859 to UTF8                                                 */
     135                 : /* -------------------------------------------------------------------- */
     136           55342 :     if( strcmp(pszSrcEncoding,CPL_ENC_ISO8859_1) == 0 
     137                 :         && strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
     138                 :     {
     139           47538 :         int nCharCount = strlen(pszSource); // NOTE(review): the size_t result of strlen is narrowed to int
     140           47538 :         char *pszResult = (char *) CPLCalloc(1,nCharCount*2+1); // sized at two output bytes per input byte plus a terminator
     141                 :         
     142           47538 :         utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount ); // the length returned by utf8froma is ignored
     143                 :         
     144           47538 :         return pszResult;
     145                 :     }
     146                 : 
     147                 : /* -------------------------------------------------------------------- */
     148                 : /*      UTF8 to ISO8859                                                 */
     149                 : /* -------------------------------------------------------------------- */
     150            7804 :     if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0 
     151                 :         && strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0 )
     152                 :     {
     153            7804 :         int nCharCount = strlen(pszSource);
     154            7804 :         char *pszResult = (char *) CPLCalloc(1,nCharCount+1); // output buffer is sized to the input length plus a terminator
     155                 :         
     156            7804 :         utf8toa( pszSource, nCharCount, pszResult, nCharCount+1 ); // the length returned by utf8toa is ignored
     157                 :         
     158            7804 :         return pszResult;
     159                 :     }
     160                 : 
     161                 : #ifdef _WIN32
     162                 : /* ---------------------------------------------------------------------*/
     163                 : /*      CPXXX to UTF8                                                   */
     164                 : /* ---------------------------------------------------------------------*/
     165                 :     if( strncmp(pszSrcEncoding,"CP",2) == 0
     166                 :         && strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
     167                 :     {
     168                 :         int nCode = atoi( pszSrcEncoding + 2 ); // numeric part after the CP prefix; atoi yields 0 when no number follows
     169                 :         if( nCode > 0 ) { // otherwise fall through to the generic handling below
     170                 :            return CPLWin32Recode( pszSource, nCode, CP_UTF8 );
     171                 :         }
     172                 :     }
     173                 : 
     174                 : /* ---------------------------------------------------------------------*/
     175                 : /*      UTF8 to CPXXX                                                   */
     176                 : /* ---------------------------------------------------------------------*/
     177                 :     if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0
     178                 :         && strncmp(pszDstEncoding,"CP",2) == 0 )
     179                 :     {
     180                 :          int nCode = atoi( pszDstEncoding + 2 ); // same parsing as in the CPXXX to UTF8 case
     181                 :          if( nCode > 0 ) {
     182                 :              return CPLWin32Recode( pszSource, CP_UTF8, nCode );
     183                 :          }
     184                 :     }
     185                 : #endif
     186                 : 
     187                 : /* -------------------------------------------------------------------- */
     188                 : /*      Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with    */
     189                 : /*      a one-time warning.                                             */
     190                 : /* -------------------------------------------------------------------- */
     191               0 :     if( strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 ) // reached for any source encoding not handled above when the destination is UTF-8
     192                 :     {
     193               0 :         int nCharCount = strlen(pszSource);
     194               0 :         char *pszResult = (char *) CPLCalloc(1,nCharCount*2+1);
     195                 : 
     196               0 :         if( !bHaveWarned1 ) // warn only once until CPLClearRecodeStubWarningFlags() resets the flag
     197                 :         {
     198               0 :             bHaveWarned1 = 1;
     199                 :             CPLError( CE_Warning, CPLE_AppDefined, 
     200                 :                       "Recode from %s to UTF-8 not supported, treated as ISO8859-1 to UTF-8.", 
     201               0 :                       pszSrcEncoding );
     202                 :         }
     203                 : 
     204               0 :         utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
     205                 :         
     206               0 :         return pszResult;
     207                 :     }
     208                 : 
     209                 : /* -------------------------------------------------------------------- */
     210                 : /*      UTF-8 to anything else is treated as UTF-8 to ISO-8859-1        */
     211                 : /*      with a warning.                                                 */
     212                 : /* -------------------------------------------------------------------- */
     213               0 :     if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0  // NOTE(review): this test is identical to the UTF8 to ISO8859 test earlier in this function, which always returns, so this block appears unreachable; the box comment above suggests the destination test was not intended here - confirm
     214                 :         && strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0 )
     215                 :     {
     216               0 :         int nCharCount = strlen(pszSource);
     217               0 :         char *pszResult = (char *) CPLCalloc(1,nCharCount+1);
     218                 : 
     219               0 :         if( !bHaveWarned2 ) // warn only once until the flag is reset
     220                 :         {
     221               0 :             bHaveWarned2 = 1;
     222                 :             CPLError( CE_Warning, CPLE_AppDefined, 
     223                 :                       "Recode from UTF-8 to %s not supported, treated as UTF-8 to ISO8859-1.", 
     224               0 :                       pszDstEncoding );
     225                 :         }
     226                 :         
     227               0 :         utf8toa( pszSource, nCharCount, pszResult, nCharCount+1 );
     228                 :         
     229               0 :         return pszResult;
     230                 :     }
     231                 : 
     232                 : /* -------------------------------------------------------------------- */
     233                 : /*      Everything else is treated as a no-op with a warning.           */
     234                 : /* -------------------------------------------------------------------- */
     235                 :     {
     236               0 :         if( !bHaveWarned3 ) // warn only once until the flag is reset
     237                 :         {
     238               0 :             bHaveWarned3 = 1;
     239                 :             CPLError( CE_Warning, CPLE_AppDefined, 
     240                 :                       "Recode from %s to %s not supported, no change applied.", 
     241               0 :                       pszSrcEncoding, pszDstEncoding );
     242                 :         }
     243                 :         
     244               0 :         return CPLStrdup(pszSource); // unchanged copy of the input
     245                 :     }
     246                 : }
     247                 : 
     248                 : /************************************************************************/
     249                 : /*                       CPLRecodeFromWCharStub()                       */
     250                 : /************************************************************************/
     251                 : 
     252                 : /**
     253                 :  * Convert wchar_t string to UTF-8. 
     254                 :  *
     255                 :  * Convert a wchar_t string into a multibyte utf-8 string.  The only
     256                 :  * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
     257                 :  * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
     258                 :  * and CPL_ENC_ISO8859_1.  In some cases (i.e. using iconv()) other encodings 
     259                 :  * may also be supported.
     260                 :  *
     261                 :  * Note that the wchar_t type varies in size on different systems. On
     262                 :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     263                 :  *
     264                 :  * If an error occurs, an error may or may not be posted with CPLError().
     265                 :  *
     266                 :  * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
     267                 :  * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
     268                 :  * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
     269                 :  *
     270                 :  * @return a zero terminated multi-byte string which should be freed with 
     271                 :  * CPLFree(), or NULL if an error occurs. 
     272                 :  */
     273                 : 
     274           12605 : char *CPLRecodeFromWCharStub( const wchar_t *pwszSource, 
     275                 :                               const char *pszSrcEncoding, 
     276                 :                               const char *pszDstEncoding )
     277                 : 
     278                 : { // Converts a 0-terminated wchar_t string to UTF-8 with utf8fromwc, then to pszDstEncoding through CPLRecodeStub when that is not UTF-8.
     279                 : /* -------------------------------------------------------------------- */
     280                 : /*      We try to avoid changes of character set.  We are just          */
     281                 : /*      providing for unicode to unicode.                               */
     282                 : /* -------------------------------------------------------------------- */
     283           12605 :     if( strcmp(pszSrcEncoding,"WCHAR_T") != 0 && // accepted source names: WCHAR_T, CPL_ENC_UTF8, CPL_ENC_UTF16, CPL_ENC_UCS2, CPL_ENC_UCS4
     284                 :         strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0
     285                 :         && strcmp(pszSrcEncoding,CPL_ENC_UTF16) != 0
     286                 :         && strcmp(pszSrcEncoding,CPL_ENC_UCS2) != 0
     287                 :         && strcmp(pszSrcEncoding,CPL_ENC_UCS4) != 0 )
     288                 :     {
     289                 :         CPLError( CE_Failure, CPLE_AppDefined,
     290                 :                   "Stub recoding implementation does not support\n"
     291                 :                   "CPLRecodeFromWCharStub(...,%s,%s)", 
     292               0 :                   pszSrcEncoding, pszDstEncoding );
     293               0 :         return NULL;
     294                 :     }
     295                 : 
     296                 : /* -------------------------------------------------------------------- */
     297                 : /*      What is the source length.                                      */
     298                 : /* -------------------------------------------------------------------- */
     299           12605 :     int nSrcLen = 0; // number of wchar_t units before the terminating 0
     300                 : 
     301         4355134 :     while( pwszSource[nSrcLen] != 0 )
     302         4329924 :         nSrcLen++;
     303                 : 
     304                 : /* -------------------------------------------------------------------- */
     305                 : /*      Allocate destination buffer plenty big.                         */
     306                 : /* -------------------------------------------------------------------- */
     307                 :     char *pszResult;
     308                 :     int nDstBufSize, nDstLen;
     309                 : 
     310           12605 :     nDstBufSize = nSrcLen * 4 + 1; // four bytes per source unit plus a terminator
     311           12605 :     pszResult = (char *) CPLMalloc(nDstBufSize); // nearly worst case.
     312                 : 
     313           12605 :     if (nSrcLen == 0) // empty input: return an empty string without calling utf8fromwc
     314                 :     {
     315               0 :         pszResult[0] = '\0';
     316               0 :         return pszResult;
     317                 :     }
     318                 : 
     319                 : /* -------------------------------------------------------------------- */
     320                 : /*      Convert, and confirm we had enough space.                       */
     321                 : /* -------------------------------------------------------------------- */
     322           12605 :     nDstLen = utf8fromwc( pszResult, nDstBufSize, pwszSource, nSrcLen ); // unsigned return value stored in an int
     323           12605 :     if( nDstLen >= nDstBufSize - 1 ) // treated as too small when the reported length leaves no spare byte
     324                 :     {
     325               0 :         CPLAssert( FALSE ); // too small!
     326               0 :         return NULL; // NOTE(review): pszResult is not released on this path - looks like a leak, confirm
     327                 :     }
     328                 : 
     329                 : /* -------------------------------------------------------------------- */
     330                 : /*      If something other than UTF-8 was requested, recode now.        */
     331                 : /* -------------------------------------------------------------------- */
     332           12605 :     if( strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 ) // UTF-8 requested: hand back the buffer as is
     333           12605 :         return pszResult;
     334                 : 
     335                 :     char *pszFinalResult = 
     336               0 :         CPLRecodeStub( pszResult, CPL_ENC_UTF8, pszDstEncoding );
     337                 : 
     338               0 :     CPLFree( pszResult ); // the intermediate UTF-8 buffer is no longer needed
     339                 :     
     340               0 :     return pszFinalResult;
     341                 : }
     342                 : 
     343                 : /************************************************************************/
     344                 : /*                        CPLRecodeToWCharStub()                        */
     345                 : /************************************************************************/
     346                 : 
     347                 : /**
     348                 :  * Convert UTF-8 string to a wchar_t string.
     349                 :  *
     350                 :  * Convert an 8-bit, multi-byte-per-character input string into a wide
     351                 :  * character (wchar_t) string.  The only guaranteed supported source encodings
     352                 :  * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1).  The only
     353                 :  * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
     354                 :  * and destination encodings may be supported depending on the underlying
     355                 :  * implementation. 
     356                 :  *
     357                 :  * Note that the wchar_t type varies in size on different systems. On
     358                 :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     359                 :  *
     360                 :  * If an error occurs, an error may or may not be posted with CPLError().
     361                 :  *
     362                 :  * @param pszSource input multi-byte character string.
     363                 :  * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
     364                 :  * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2. 
     365                 :  *
     366                 :  * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
     367                 :  * NULL on error.
     368                 :  *
     369                 :  * @since GDAL 1.6.0
     370                 :  */
     371                 : 
     372            2933 : wchar_t *CPLRecodeToWCharStub( const char *pszSource,
     373                 :                                const char *pszSrcEncoding, 
     374                 :                                const char *pszDstEncoding )
     375                 : 
     376                 : { // Converts pszSource to a wchar_t string, going through UTF-8 first when the source is neither UTF-8 nor ASCII.
     377            2933 :     char *pszUTF8Source = (char *) pszSource; // const is cast away so one pointer can refer to either the caller string or a temporary copy; it is freed at the end only when it differs from pszSource
     378                 : 
     379            2933 :     if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0 
     380                 :         && strcmp(pszSrcEncoding,CPL_ENC_ASCII) != 0 )
     381                 :     {
     382               0 :         pszUTF8Source = CPLRecodeStub( pszSource, pszSrcEncoding, CPL_ENC_UTF8 ); // temporary UTF-8 copy, released before returning on the success path
     383               0 :         if( pszUTF8Source == NULL )
     384               0 :             return NULL;
     385                 :     }
     386                 : 
     387                 : /* -------------------------------------------------------------------- */
     388                 : /*      We try to avoid changes of character set.  We are just          */
     389                 : /*      providing for unicode to unicode.                               */
     390                 : /* -------------------------------------------------------------------- */
     391            2933 :     if( strcmp(pszDstEncoding,"WCHAR_T") != 0 // accepted destination names: WCHAR_T, CPL_ENC_UCS2, CPL_ENC_UCS4, CPL_ENC_UTF16
     392                 :         && strcmp(pszDstEncoding,CPL_ENC_UCS2) != 0
     393                 :         && strcmp(pszDstEncoding,CPL_ENC_UCS4) != 0 
     394                 :         && strcmp(pszDstEncoding,CPL_ENC_UTF16) != 0 )
     395                 :     {
     396                 :         CPLError( CE_Failure, CPLE_AppDefined,
     397                 :                   "Stub recoding implementation does not support\n"
     398                 :                   "CPLRecodeToWCharStub(...,%s,%s)", 
     399               0 :                   pszSrcEncoding, pszDstEncoding );
     400               0 :         return NULL; // NOTE(review): a temporary pszUTF8Source allocated above is not freed on this path - looks like a leak, confirm
     401                 :     }
     402                 : 
     403                 : /* -------------------------------------------------------------------- */
     404                 : /*      Do the UTF-8 to UCS-2 recoding.                                 */
     405                 : /* -------------------------------------------------------------------- */
     406            2933 :     int nSrcLen = strlen(pszUTF8Source); // NOTE(review): the size_t result of strlen is narrowed to int
     407            2933 :     wchar_t *pwszResult = (wchar_t *) CPLCalloc(sizeof(wchar_t),nSrcLen+1); // one wchar_t per source byte plus a terminator
     408                 : 
     409            2933 :     utf8towc( pszUTF8Source, nSrcLen, pwszResult, nSrcLen+1 ); // the length returned by utf8towc is ignored
     410                 : 
     411            2933 :     if( pszUTF8Source != pszSource ) // free only the temporary copy, never the caller buffer
     412               0 :         CPLFree( pszUTF8Source );
     413                 : 
     414            2933 :     return pwszResult;
     415                 : }
     416                 : 
     417                 : 
     418                 : /************************************************************************/
     419                 : /*                           CPLIsUTF8Stub()                            */
     420                 : /************************************************************************/
     421                 : 
     422                 : /**
     423                 :  * Test if a string is encoded as UTF-8.
     424                 :  *
     425                 :  * @param pabyData input string to test
     426                 :  * @param nLen length of the input string, or -1 if the function must compute
     427                 :  *             the string length. In which case it must be null terminated.
     428                 :  * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
     429                 :  *
     430                 :  * @since GDAL 1.7.0
     431                 :  */
     432          476054 : int CPLIsUTF8Stub(const char* pabyData, int nLen) // returns 1 when utf8test() gives a non-zero result for the buffer, 0 otherwise
     433                 : {
     434          476054 :     if (nLen < 0) // negative length: measure the NUL-terminated string
     435          476054 :         nLen = strlen(pabyData);
     436          476054 :     return utf8test(pabyData, (unsigned)nLen) != 0; // utf8test is defined outside this view; only its zero or non-zero status is used here
     437                 : }
     438                 : 
     439                 : /************************************************************************/
     440                 : /* ==================================================================== */
     441                 : /*  UTF.C code from FLTK with some modifications.                   */
     442                 : /* ==================================================================== */
     443                 : /************************************************************************/
     444                 : 
     445                 : /* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
     446                 :    they are instead turned into the Unicode REPLACEMENT CHARACTER, of
     447                 :    value 0xfffd.
     448                 :    If this is on, utf8decode will correctly map most (perhaps all)
     449                 :    human-readable text that is in ISO-8859-1. This may allow you
     450                 :    to completely ignore character sets in your code because virtually
     451                 :    everything is either ISO-8859-1 or UTF-8.
     452                 : */
     453                 : #define ERRORS_TO_ISO8859_1 1 /* currently enabled */
     454                 : 
     455                 : /* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
     456                 :    Unicode index for Microsoft's CP1252 character set. You should
     457                 :    also set ERRORS_TO_ISO8859_1. With this, a much larger amount of
     458                 :    available text (such as all web pages) is correctly converted
     459                 :    to Unicode.
     460                 : */
     461                 : #define ERRORS_TO_CP1252 1 /* currently enabled; also guards the cp1252 table that follows */
     462                 : 
     463                 : /* A number of Unicode code points are in fact illegal and should not
     464                 :    be produced by a UTF-8 converter. Turning this on will replace the
     465                 :    bytes in those encodings with errors. If you do this then converting
     466                 :    arbitrary 16-bit data to UTF-8 and then back is not an identity,
     467                 :    which will probably break a lot of software.
     468                 : */
     469                 : #define STRICT_RFC3629 0 /* currently disabled */
     470                 : 
     471                 : #if ERRORS_TO_CP1252
     472                 : // Unicode code points for the Microsoft CP1252 bytes 0x80..0x9f, eight per row.
     473                 : // Presumably indexed by (byte - 0x80); the lookup itself is outside this view.
     474                 : static unsigned short cp1252[32] = {
     475                 :   0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021, // 0x80..0x87
     476                 :   0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f, // 0x88..0x8f
     477                 :   0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, // 0x90..0x97
     478                 :   0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178  // 0x98..0x9f
     479                 : }; // the entries 0x0081, 0x008d, 0x008f, 0x0090 and 0x009d equal their own byte value
     480                 : #endif
     481                 : 
     482                 : /************************************************************************/
     483                 : /*                             utf8decode()                             */
     484                 : /************************************************************************/
     485                 : 
     486                 : /*
     487                 :     Decode a single UTF-8 encoded character starting at \e p. The
     488                 :     resulting Unicode value (in the range 0-0x10ffff) is returned,
     489                 :     and \e len is set to the number of bytes in the UTF-8 encoding
     490                 :     (adding \e len to \e p will point at the next character).
     491                 : 
     492                 :     If \a p points at an illegal UTF-8 encoding, including one that
     493                 :     would go past \e end, or where a code uses more bytes than
     494                 :     necessary, then *(unsigned char*)p is translated as though it is
     495                 :     in the Microsoft CP1252 character set and \e len is set to 1.
     496                 :     Treating errors this way allows this to decode almost any
     497                 :     ISO-8859-1 or CP1252 text that has been mistakenly placed where
     498                 :     UTF-8 is expected, and has proven very useful.
     499                 : 
     500                 :     If you want errors to be converted to error characters (as the
     501                 :     standards recommend), adding a test to see if the length is
     502                 :     unexpectedly 1 will work:
     503                 : 
     504                 : \code
     505                 :     if (*p & 0x80) { // what should be a multibyte encoding
     506                 :       code = utf8decode(p,end,&len);
     507                 :       if (len<2) code = 0xFFFD; // Turn errors into REPLACEMENT CHARACTER
     508                 :     } else { // handle the 1-byte utf8 encoding:
     509                 :       code = *p;
     510                 :       len = 1;
     511                 :     }
     512                 : \endcode
     513                 : 
     514                 :     Direct testing for the 1-byte case (as shown above) will also
     515                 :     speed up the scanning of strings where the majority of characters
     516                 :     are ASCII.
     517                 : */
     518             243 : static unsigned utf8decode(const char* p, const char* end, int* len)
     519                 : {
     520             243 :   unsigned char c = *(unsigned char*)p;
     521             243 :   if (c < 0x80) {
     522               0 :     *len = 1;
     523               0 :     return c;
     524                 : #if ERRORS_TO_CP1252
     525             243 :   } else if (c < 0xa0) {
     526               1 :     *len = 1;
     527               1 :     return cp1252[c-0x80];
     528                 : #endif
     529             242 :   } else if (c < 0xc2) {
     530               0 :     goto FAIL;
     531                 :   }
     532             242 :   if (p+1 >= end || (p[1]&0xc0) != 0x80) goto FAIL;
     533             242 :   if (c < 0xe0) {
     534             238 :     *len = 2;
     535                 :     return
     536             238 :       ((p[0] & 0x1f) << 6) +
     537             238 :       ((p[1] & 0x3f));
     538               4 :   } else if (c == 0xe0) {
     539               0 :     if (((unsigned char*)p)[1] < 0xa0) goto FAIL;
     540               0 :     goto UTF8_3;
     541                 : #if STRICT_RFC3629
     542                 :   } else if (c == 0xed) {
     543                 :     // RFC 3629 says surrogate chars are illegal.
     544                 :     if (((unsigned char*)p)[1] >= 0xa0) goto FAIL;
     545                 :     goto UTF8_3;
     546                 :   } else if (c == 0xef) {
     547                 :     // 0xfffe and 0xffff are also illegal characters
     548                 :     if (((unsigned char*)p)[1]==0xbf &&
     549                 :   ((unsigned char*)p)[2]>=0xbe) goto FAIL;
     550                 :     goto UTF8_3;
     551                 : #endif
     552               4 :   } else if (c < 0xf0) {
     553                 :   UTF8_3:
     554               4 :     if (p+2 >= end || (p[2]&0xc0) != 0x80) goto FAIL;
     555               4 :     *len = 3;
     556                 :     return
     557               4 :       ((p[0] & 0x0f) << 12) +
     558               4 :       ((p[1] & 0x3f) << 6) +
     559               8 :       ((p[2] & 0x3f));
     560               0 :   } else if (c == 0xf0) {
     561               0 :     if (((unsigned char*)p)[1] < 0x90) goto FAIL;
     562               0 :     goto UTF8_4;
     563               0 :   } else if (c < 0xf4) {
     564                 :   UTF8_4:
     565               0 :     if (p+3 >= end || (p[2]&0xc0) != 0x80 || (p[3]&0xc0) != 0x80) goto FAIL;
     566               0 :     *len = 4;
     567                 : #if STRICT_RFC3629
     568                 :     // RFC 3629 says all codes ending in fffe or ffff are illegal:
     569                 :     if ((p[1]&0xf)==0xf &&
     570                 :   ((unsigned char*)p)[2] == 0xbf &&
     571                 :   ((unsigned char*)p)[3] >= 0xbe) goto FAIL;
     572                 : #endif
     573                 :     return
     574               0 :       ((p[0] & 0x07) << 18) +
     575               0 :       ((p[1] & 0x3f) << 12) +
     576               0 :       ((p[2] & 0x3f) << 6) +
     577               0 :       ((p[3] & 0x3f));
     578               0 :   } else if (c == 0xf4) {
     579               0 :     if (((unsigned char*)p)[1] > 0x8f) goto FAIL; // after 0x10ffff
     580               0 :     goto UTF8_4;
     581                 :   } else {
     582                 :   FAIL:
     583               0 :     *len = 1;
     584                 : #if ERRORS_TO_ISO8859_1
     585               0 :     return c;
     586                 : #else
     587                 :     return 0xfffd; // Unicode REPLACEMENT CHARACTER
     588                 : #endif
     589                 :   }
     590                 : }
     591                 : 
     592                 : /************************************************************************/
     593                 : /*                              utf8fwd()                               */
     594                 : /************************************************************************/
     595                 : 
     596                 : /*
     597                 :   Move \a p forward until it points to the start of a UTF-8
     598                 :   character. If it already points at the start of one then it
     599                 :   is returned unchanged. Any UTF-8 errors are treated as though each
     600                 :   byte of the error is an individual character.
     601                 : 
     602                 :   \e start is the start of the string and is used to limit the
     603                 :   backwards search for the start of a utf8 character.
     604                 : 
     605                 :   \e end is the end of the string and is assumed to be a break
     606                 :   between characters. It is assumed to be greater than p.
     607                 : 
     608                 :   This function is for moving a pointer that was jumped to the
     609                 :   middle of a string, such as when doing a binary search for
     610                 :   a position. You should use either this or utf8back() depending
     611                 :   on which direction your algorithm can handle the pointer
     612                 :   moving. Do not use this to scan strings, use utf8decode()
     613                 :   instead.
     614                 : */
     615                 : 
     616                 : #ifdef FUTURE_NEEDS
     617                 : static const char* utf8fwd(const char* p, const char* start, const char* end)
     618                 : {
     619                 :   const char* a;
     620                 :   int len;
     621                 :   // if we are not pointing at a continuation character, we are done:
     622                 :   if ((*p&0xc0) != 0x80) return p;
     623                 :   // search backwards for a 0xc0 starting the character:
     624                 :   for (a = p-1; ; --a) {
     625                 :     if (a < start) return p;
     626                 :     if (!(a[0]&0x80)) return p;
     627                 :     if ((a[0]&0x40)) break;
     628                 :   }
     629                 :   utf8decode(a,end,&len);
     630                 :   a += len;
     631                 :   if (a > p) return a;
     632                 :   return p;
     633                 : }
     634                 : #endif /* def FUTURE_NEEDS */
     635                 : 
     636                 : /************************************************************************/
     637                 : /*                              utf8back()                              */
     638                 : /************************************************************************/
     639                 : 
     640                 : /*
     641                 :   Move \a p backward until it points to the start of a UTF-8
     642                 :   character. If it already points at the start of one then it
     643                 :   is returned unchanged. Any UTF-8 errors are treated as though each
     644                 :   byte of the error is an individual character.
     645                 : 
     646                 :   \e start is the start of the string and is used to limit the
     647                 :   backwards search for the start of a UTF-8 character.
     648                 : 
     649                 :   \e end is the end of the string and is assumed to be a break
     650                 :   between characters. It is assumed to be greater than p.
     651                 : 
     652                 :   If you wish to decrement a UTF-8 pointer, pass p-1 to this.
     653                 : */
     654                 : 
     655                 : #ifdef FUTURE_NEEDS
     656                 : static const char* utf8back(const char* p, const char* start, const char* end)
     657                 : {
     658                 :   const char* a;
     659                 :   int len;
     660                 :   // if we are not pointing at a continuation character, we are done:
     661                 :   if ((*p&0xc0) != 0x80) return p;
     662                 :   // search backwards for a 0xc0 starting the character:
     663                 :   for (a = p-1; ; --a) {
     664                 :     if (a < start) return p;
     665                 :     if (!(a[0]&0x80)) return p;
     666                 :     if ((a[0]&0x40)) break;
     667                 :   }
     668                 :   utf8decode(a,end,&len);
     669                 :   if (a+len > p) return a;
     670                 :   return p;
     671                 : }
     672                 : #endif /* def FUTURE_NEEDS */
     673                 : 
     674                 : /************************************************************************/
     675                 : /*                             utf8bytes()                              */
     676                 : /************************************************************************/
     677                 : 
     678                 : /* Returns number of bytes that utf8encode() will use to encode the
     679                 :   character \a ucs. */
     680                 : #ifdef FUTURE_NEEDS
     681                 : static int utf8bytes(unsigned ucs) {
     682                 :   if (ucs < 0x000080U) {
     683                 :     return 1;
     684                 :   } else if (ucs < 0x000800U) {
     685                 :     return 2;
     686                 :   } else if (ucs < 0x010000U) {
     687                 :     return 3;
     688                 :   } else if (ucs < 0x10ffffU) {
     689                 :     return 4;
     690                 :   } else {
     691                 :     return 3; // length of the illegal character encoding
     692                 :   }
     693                 : }
     694                 : #endif /* def FUTURE_NEEDS */
     695                 : 
     696                 : /************************************************************************/
     697                 : /*                             utf8encode()                             */
     698                 : /************************************************************************/
     699                 : 
     700                 : /* Write the UTF-8 encoding of \e ucs into \e buf and return the
     701                 :     number of bytes written. Up to 4 bytes may be written. If you know
     702                 :     that \a ucs is less than 0x10000 then at most 3 bytes will be written.
     703                 :     If you wish to speed this up, remember that anything less than 0x80
     704                 :     is written as a single byte.
     705                 : 
     706                 :     If ucs is greater than 0x10ffff this is an illegal character
     707                 :     according to RFC 3629. These are converted as though they are
     708                 :     0xFFFD (REPLACEMENT CHARACTER).
     709                 : 
     710                 :     RFC 3629 also says many other values for \a ucs are illegal (in
     711                 :     the range 0xd800 to 0xdfff, or ending with 0xfffe or
     712                 :     0xffff). However I encode these as though they are legal, so that
     713                 :     utf8encode/utf8decode will be the identity for all codes between 0
     714                 :     and 0x10ffff.
     715                 : */
     716                 : #ifdef FUTURE_NEEDS
     717                 : static int utf8encode(unsigned ucs, char* buf) {
     718                 :   if (ucs < 0x000080U) {
     719                 :     buf[0] = ucs;
     720                 :     return 1;
     721                 :   } else if (ucs < 0x000800U) {
     722                 :     buf[0] = 0xc0 | (ucs >> 6);
     723                 :     buf[1] = 0x80 | (ucs & 0x3F);
     724                 :     return 2;
     725                 :   } else if (ucs < 0x010000U) {
     726                 :     buf[0] = 0xe0 | (ucs >> 12);
     727                 :     buf[1] = 0x80 | ((ucs >> 6) & 0x3F);
     728                 :     buf[2] = 0x80 | (ucs & 0x3F);
     729                 :     return 3;
     730                 :   } else if (ucs < 0x0010ffffU) {
     731                 :     buf[0] = 0xf0 | (ucs >> 18);
     732                 :     buf[1] = 0x80 | ((ucs >> 12) & 0x3F);
     733                 :     buf[2] = 0x80 | ((ucs >> 6) & 0x3F);
     734                 :     buf[3] = 0x80 | (ucs & 0x3F);
     735                 :     return 4;
     736                 :   } else {
     737                 :     // encode 0xfffd:
     738                 :     buf[0] = 0xefU;
     739                 :     buf[1] = 0xbfU;
     740                 :     buf[2] = 0xbdU;
     741                 :     return 3;
     742                 :   }
     743                 : }
     744                 : #endif /* def FUTURE_NEEDS */
     745                 : 
     746                 : /************************************************************************/
     747                 : /*                              utf8towc()                              */
     748                 : /************************************************************************/
     749                 : 
     750                 : /*  Convert a UTF-8 sequence into an array of wchar_t. These
     751                 :     are used by some system calls, especially on Windows.
     752                 : 
     753                 :     \a src points at the UTF-8, and \a srclen is the number of bytes to
     754                 :     convert.
     755                 : 
     756                 :     \a dst points at an array to write, and \a dstlen is the number of
     757                 :     locations in this array. At most \a dstlen-1 words will be
     758                 :     written there, plus a 0 terminating word. Thus this function
     759                 :     will never overwrite the buffer and will always return a
     760                 :     zero-terminated string. If \a dstlen is zero then \a dst can be
     761                 :     null and no data is written, but the length is returned.
     762                 : 
     763                 :     The return value is the number of words that \e would be written
     764                 :     to \a dst if it were long enough, not counting the terminating
     765                 :     zero. If the return value is greater or equal to \a dstlen it
     766                 :     indicates truncation, you can then allocate a new array of size
     767                 :     return+1 and call this again.
     768                 : 
     769                 :     Errors in the UTF-8 are converted as though each byte in the
     770                 :     erroneous string is in the Microsoft CP1252 encoding. This allows
     771                 :     ISO-8859-1 text mistakenly identified as UTF-8 to be printed
     772                 :     correctly.
     773                 : 
     774                 :     Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
     775                 :     and most other systems. Where wchar_t is 16 bits, Unicode
     776                 :     characters in the range 0x10000 to 0x10ffff are converted to
     777                 :     "surrogate pairs" which take two words each (this is called UTF-16
     778                 :     encoding). If wchar_t is 32 bits this rather nasty problem is
     779                 :     avoided.
     780                 : */
     781            2933 : static unsigned utf8towc(const char* src, unsigned srclen,
     782                 :                          wchar_t* dst, unsigned dstlen)
     783                 : {
     784            2933 :   const char* p = src;
     785            2933 :   const char* e = src+srclen;
     786            2933 :   unsigned count = 0;
     787           15241 :   if (dstlen) for (;;) {
     788           15241 :     if (p >= e) {dst[count] = 0; return count;}
     789           12308 :     if (!(*p & 0x80)) { // ascii
     790           12160 :       dst[count] = *p++;
     791                 :     } else {
     792             148 :       int len; unsigned ucs = utf8decode(p,e,&len);
     793             148 :       p += len;
     794                 : #ifdef _WIN32
     795                 :       if (ucs < 0x10000) {
     796                 :           dst[count] = (wchar_t)ucs;
     797                 :       } else {
     798                 :   // make a surrogate pair:
     799                 :   if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
     800                 :         dst[count] = (wchar_t)((((ucs-0x10000u)>>10)&0x3ff) | 0xd800);
     801                 :         dst[++count] = (wchar_t)((ucs&0x3ff) | 0xdc00);
     802                 :       }
     803                 : #else
     804             148 :       dst[count] = (wchar_t)ucs;
     805                 : #endif
     806                 :     }
     807           12308 :     if (++count == dstlen) {dst[count-1] = 0; break;}
     808                 :   }
     809                 :   // we filled dst, measure the rest:
     810               0 :   while (p < e) {
     811               0 :     if (!(*p & 0x80)) p++;
     812                 :     else {
     813                 : #ifdef _WIN32
     814                 :       int len; unsigned ucs = utf8decode(p,e,&len);
     815                 :       p += len;
     816                 :       if (ucs >= 0x10000) ++count;
     817                 : #else
     818               0 :       int len; utf8decode(p,e,&len);
     819               0 :       p += len;
     820                 : #endif
     821                 :     }
     822               0 :     ++count;
     823                 :   }
     824               0 :   return count;
     825                 : }
     826                 : 
     827                 : /************************************************************************/
     828                 : /*                              utf8toa()                               */
     829                 : /************************************************************************/
     830                 : /* Convert a UTF-8 sequence into an array of 1-byte characters.
     831                 : 
     832                 :     If the UTF-8 decodes to a character greater than 0xff then it is
     833                 :     replaced with '?'.
     834                 : 
     835                 :     Errors in the UTF-8 are converted as individual bytes, same as
     836                 :     utf8decode() does. This allows ISO-8859-1 text mistakenly identified
     837                 :     as UTF-8 to be printed correctly (and possibly CP1252 on Windows).
     838                 : 
     839                 :     \a src points at the UTF-8, and \a srclen is the number of bytes to
     840                 :     convert.
     841                 : 
     842                 :     Up to \a dstlen bytes are written to \a dst, including a null
     843                 :     terminator. The return value is the number of bytes that would be
     844                 :     written, not counting the null terminator. If greater or equal to
     845                 :     \a dstlen then if you malloc a new array of size n+1 you will have
     846                 :     the space needed for the entire string. If \a dstlen is zero then
     847                 :     nothing is written and this call just measures the storage space
     848                 :     needed.
     849                 : */
     850            7804 : static unsigned utf8toa(const char* src, unsigned srclen,
     851                 :                         char* dst, unsigned dstlen)
     852                 : {
     853            7804 :   const char* p = src;
     854            7804 :   const char* e = src+srclen;
     855            7804 :   unsigned count = 0;
     856           58894 :   if (dstlen) for (;;) {
     857                 :     unsigned char c;
     858           58894 :     if (p >= e) {dst[count] = 0; return count;}
     859           51090 :     c = *(unsigned char*)p;
     860           51090 :     if (c < 0xC2) { // ascii or bad code
     861           51077 :       dst[count] = c;
     862           51077 :       p++;
     863                 :     } else {
     864              13 :       int len; unsigned ucs = utf8decode(p,e,&len);
     865              13 :       p += len;
     866              13 :       if (ucs < 0x100) dst[count] = (char)ucs;
     867                 :       else
     868                 :       {
     869               4 :           if (!bHaveWarned4)
     870                 :           {
     871               2 :               bHaveWarned4 = TRUE;
     872                 :               CPLError(CE_Warning, CPLE_AppDefined,
     873                 :                        "One or several characters couldn't be converted correctly from UTF-8 to ISO-8859-1.\n"
     874               2 :                        "This warning will not be emitted anymore.");
     875                 :           }
     876               4 :           dst[count] = '?';
     877                 :       }
     878                 :     }
     879           51090 :     if (++count >= dstlen) {dst[count-1] = 0; break;}
     880                 :   }
     881                 :   // we filled dst, measure the rest:
     882               0 :   while (p < e) {
     883               0 :     if (!(*p & 0x80)) p++;
     884                 :     else {
     885                 :       int len;
     886               0 :       utf8decode(p,e,&len);
     887               0 :       p += len;
     888                 :     }
     889               0 :     ++count;
     890                 :   }
     891               0 :   return count;
     892                 : }
     893                 : 
     894                 : /************************************************************************/
     895                 : /*                             utf8fromwc()                             */
     896                 : /************************************************************************/
     897                 : /* Turn "wide characters" as returned by some system calls
     898                 :     (especially on Windows) into UTF-8.
     899                 : 
     900                 :     Up to \a dstlen bytes are written to \a dst, including a null
     901                 :     terminator. The return value is the number of bytes that would be
     902                 :     written, not counting the null terminator. If greater or equal to
     903                 :     \a dstlen then if you malloc a new array of size n+1 you will have
     904                 :     the space needed for the entire string. If \a dstlen is zero then
     905                 :     nothing is written and this call just measures the storage space
     906                 :     needed.
     907                 : 
     908                 :     \a srclen is the number of words in \a src to convert. On Windows
     909                 :     this is not necessarily the number of characters, due to there
     910                 :     possibly being "surrogate pairs" in the UTF-16 encoding used.
     911                 :     On Unix wchar_t is 32 bits and each location is a character.
     912                 : 
     913                 :     On Unix if a src word is greater than 0x10ffff then this is an
     914                 :     illegal character according to RFC 3629. These are converted as
     915                 :     though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
     916                 :     range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
     917                 :     illegal according to RFC 3629. However I encode these as though
     918                 :     they are legal, so that utf8towc will return the original data.
     919                 : 
     920                 :     On Windows "surrogate pairs" are converted to a single character
     921                 :     and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
     922                 :     pairs are converted as though they are individual characters.
     923                 : */
     924           12605 : static unsigned utf8fromwc(char* dst, unsigned dstlen,
     925                 :                            const wchar_t* src, unsigned srclen) {
     926           12605 :   unsigned i = 0;
     927           12605 :   unsigned count = 0;
     928         4342529 :   if (dstlen) for (;;) {
     929                 :     unsigned ucs;
     930         4342529 :     if (i >= srclen) {dst[count] = 0; return count;}
     931         4329924 :     ucs = src[i++];
     932         4329924 :     if (ucs < 0x80U) {
     933         4328889 :       dst[count++] = (char)ucs;
     934         4328889 :       if (count >= dstlen) {dst[count-1] = 0; break;}
     935            1035 :     } else if (ucs < 0x800U) { // 2 bytes
     936            1035 :       if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
     937            1035 :       dst[count++] = 0xc0 | (char)(ucs >> 6);
     938            1035 :       dst[count++] = 0x80 | (char)(ucs & 0x3F);
     939                 : #ifdef _WIN32
     940                 :     } else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
     941                 :          src[i] >= 0xdc00 && src[i] <= 0xdfff) {
     942                 :       // surrogate pair
     943                 :       unsigned ucs2 = src[i++];
     944                 :       ucs = 0x10000U + ((ucs&0x3ff)<<10) + (ucs2&0x3ff);
     945                 :       // all surrogate pairs turn into 4-byte utf8
     946                 : #else
     947               0 :     } else if (ucs >= 0x10000) {
     948               0 :       if (ucs > 0x10ffff) {
     949               0 :   ucs = 0xfffd;
     950               0 :   goto J1;
     951                 :       }
     952                 : #endif
     953               0 :       if (count+4 >= dstlen) {dst[count] = 0; count += 4; break;}
     954               0 :       dst[count++] = 0xf0 | (char)(ucs >> 18);
     955               0 :       dst[count++] = 0x80 | (char)((ucs >> 12) & 0x3F);
     956               0 :       dst[count++] = 0x80 | (char)((ucs >> 6) & 0x3F);
     957               0 :       dst[count++] = 0x80 | (char)(ucs & 0x3F);
     958                 :     } else {
     959                 : #ifndef _WIN32
     960                 :     J1:
     961                 : #endif
     962                 :       // all others are 3 bytes:
     963               0 :       if (count+3 >= dstlen) {dst[count] = 0; count += 3; break;}
     964               0 :       dst[count++] = 0xe0 | (char)(ucs >> 12);
     965               0 :       dst[count++] = 0x80 | (char)((ucs >> 6) & 0x3F);
     966               0 :       dst[count++] = 0x80 | (char)(ucs & 0x3F);
     967                 :     }
     968                 :   }
     969                 :   // we filled dst, measure the rest:
     970               0 :   while (i < srclen) {
     971               0 :     unsigned ucs = src[i++];
     972               0 :     if (ucs < 0x80U) {
     973               0 :       count++;
     974               0 :     } else if (ucs < 0x800U) { // 2 bytes
     975               0 :       count += 2;
     976                 : #ifdef _WIN32
     977                 :     } else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen-1 &&
     978                 :          src[i+1] >= 0xdc00 && src[i+1] <= 0xdfff) {
     979                 :       // surrogate pair
     980                 :       ++i;
     981                 : #else
     982               0 :     } else if (ucs >= 0x10000 && ucs <= 0x10ffff) {
     983                 : #endif
     984               0 :       count += 4;
     985                 :     } else {
     986               0 :       count += 3;
     987                 :     }
     988                 :   }
     989               0 :   return count;
     990                 : }
     991                 : 
     992                 : 
     993                 : /************************************************************************/
     994                 : /*                             utf8froma()                              */
     995                 : /************************************************************************/
     996                 : 
     997                 : /* Convert an ISO-8859-1 (ie normal c-string) byte stream to UTF-8.
     998                 : 
     999                 :     It is possible this should convert Microsoft's CP1252 to UTF-8
    1000                 :     instead. This would translate the codes in the range 0x80-0x9f
    1001                 :     to different characters. Currently it does not do this.
    1002                 : 
    1003                 :     Up to \a dstlen bytes are written to \a dst, including a null
    1004                 :     terminator. The return value is the number of bytes that would be
    1005                 :     written, not counting the null terminator. If greater or equal to
    1006                 :     \a dstlen then if you malloc a new array of size n+1 you will have
    1007                 :     the space needed for the entire string. If \a dstlen is zero then
    1008                 :     nothing is written and this call just measures the storage space
    1009                 :     needed.
    1010                 : 
    1011                 :     \a srclen is the number of bytes in \a src to convert.
    1012                 : 
    1013                 :     If the return value equals \a srclen then this indicates that
    1014                 :     no conversion is necessary, as only ASCII characters are in the
    1015                 :     string.
    1016                 : */
    1017           47538 : static unsigned utf8froma(char* dst, unsigned dstlen,
    1018                 :                           const char* src, unsigned srclen) {
    1019           47538 :   const char* p = src;
    1020           47538 :   const char* e = src+srclen;
    1021           47538 :   unsigned count = 0;
    1022          879458 :   if (dstlen) for (;;) {
    1023                 :     unsigned char ucs;
    1024          879458 :     if (p >= e) {dst[count] = 0; return count;}
    1025          831920 :     ucs = *(unsigned char*)p++;
    1026          831920 :     if (ucs < 0x80U) {
    1027          831888 :       dst[count++] = ucs;
    1028          831888 :       if (count >= dstlen) {dst[count-1] = 0; break;}
    1029                 :     } else { // 2 bytes (note that CP1252 translate could make 3 bytes!)
    1030              32 :       if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
    1031              32 :       dst[count++] = 0xc0 | (ucs >> 6);
    1032              32 :       dst[count++] = 0x80 | (ucs & 0x3F);
    1033                 :     }
    1034                 :   }
    1035                 :   // we filled dst, measure the rest:
    1036               0 :   while (p < e) {
    1037               0 :     unsigned char ucs = *(unsigned char*)p++;
    1038               0 :     if (ucs < 0x80U) {
    1039               0 :       count++;
    1040                 :     } else {
    1041               0 :       count += 2;
    1042                 :     }
    1043                 :   }
    1044               0 :   return count;
    1045                 : }
    1046                 : 
    1047                 : #ifdef _WIN32
    1048                 : 
    1049                 : /************************************************************************/
    1050                 : /*                            CPLWin32Recode()                          */
    1051                 : /************************************************************************/
    1052                 : 
    1053                 : /* Convert a CODEPAGE (i.e. normal c-string) byte stream
    1054                 :      to another CODEPAGE (i.e. normal c-string) byte stream.
    1055                 : 
    1056                 :     \a src is the source c-string byte stream (including a null terminator).
    1057                 :     \a src_code_page is the source c-string byte code page.
    1058                 :     \a dst_code_page is the destination c-string byte code page.
    1059                 : 
    1060                 :    UTF7          65000
    1061                 :    UTF8          65001
    1062                 :    OEM-US          437
    1063                 :    OEM-ARABIC      720
    1064                 :    OEM-GREEK       737
    1065                 :    OEM-BALTIC      775
    1066                 :    OEM-MLATIN1     850
    1067                 :    OEM-LATIN2      852
    1068                 :    OEM-CYRILLIC    855
    1069                 :    OEM-TURKISH     857
    1070                 :    OEM-MLATIN1P    858
    1071                 :    OEM-HEBREW      862
    1072                 :    OEM-RUSSIAN     866
    1073                 : 
    1074                 :    THAI            874
    1075                 :    SJIS            932
    1076                 :    GBK             936
    1077                 :    KOREA           949
    1078                 :    BIG5            950
    1079                 : 
    1080                 :    EUROPE         1250
    1081                 :    CYRILLIC       1251
    1082                 :    LATIN1         1252
    1083                 :    GREEK          1253
    1084                 :    TURKISH        1254
    1085                 :    HEBREW         1255
    1086                 :    ARABIC         1256
    1087                 :    BALTIC         1257
    1088                 :    VIETNAM        1258
    1089                 : 
    1090                 :    ISO-LATIN1    28591
    1091                 :    ISO-LATIN2    28592
    1092                 :    ISO-LATIN3    28593
    1093                 :    ISO-BALTIC    28594
    1094                 :    ISO-CYRILLIC  28595
    1095                 :    ISO-ARABIC    28596
    1096                 :    ISO-HEBREW    28598
    1097                 :    ISO-TURKISH   28599
    1098                 :    ISO-LATIN9    28605
    1099                 : 
    1100                 :    ISO-2022-JP   50220
    1101                 : 
    1102                 : */
    1103                 : 
    1104                 : char* CPLWin32Recode( const char* src, unsigned src_code_page, unsigned dst_code_page )
    1105                 : {
    1106                 :     /* Convert from source code page to Unicode */
    1107                 : 
    1108                 :     /* Compute the length in wide characters */
    1109                 :     int wlen = MultiByteToWideChar( src_code_page, MB_ERR_INVALID_CHARS, src, -1, 0, 0 );
    1110                 :     if (wlen == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
    1111                 :     {
    1112                 :         if (!bHaveWarned5)
    1113                 :         {
    1114                 :             bHaveWarned5 = TRUE;
    1115                 :             CPLError(CE_Warning, CPLE_AppDefined,
    1116                 :                     "One or several characters could not be translated from CP%d. "
    1117                 :                     "This warning will not be emitted anymore.", src_code_page);
    1118                 :         }
    1119                 : 
    1120                 :         /* Retry now without MB_ERR_INVALID_CHARS flag */
    1121                 :         wlen = MultiByteToWideChar( src_code_page, 0, src, -1, 0, 0 );
    1122                 :     }
    1123                 : 
    1124                 :     /* Do the actual conversion */
    1125                 :     wchar_t* tbuf = (wchar_t*)CPLCalloc(sizeof(wchar_t),wlen+1);
    1126                 :     tbuf[wlen] = 0;
    1127                 :     MultiByteToWideChar( src_code_page, 0, src, -1, tbuf, wlen+1 );
    1128                 : 
    1129                 :     /* Convert from Unicode to destination code page */
    1130                 : 
    1131                 :     /* Compute the length in chars */
    1132                 :     BOOL bUsedDefaultChar = FALSE;
    1133                 :     int len;
    1134                 :     if ( dst_code_page == CP_UTF7 || dst_code_page == CP_UTF8 )
    1135                 :         len = WideCharToMultiByte( dst_code_page, 0, tbuf, -1, 0, 0, 0, NULL );
    1136                 :     else
    1137                 :         len = WideCharToMultiByte( dst_code_page, 0, tbuf, -1, 0, 0, 0, &bUsedDefaultChar );
    1138                 :     if (bUsedDefaultChar)
    1139                 :     {
    1140                 :         if (!bHaveWarned6)
    1141                 :         {
    1142                 :             bHaveWarned6 = TRUE;
    1143                 :             CPLError(CE_Warning, CPLE_AppDefined,
    1144                 :                     "One or several characters could not be translated to CP%d. "
    1145                 :                     "This warning will not be emitted anymore.", dst_code_page);
    1146                 :         }
    1147                 :     }
    1148                 : 
    1149                 :     /* Do the actual conversion */
    1150                 :     char* pszResult = (char*)CPLCalloc(sizeof(char),len+1);
    1151                 :     WideCharToMultiByte( dst_code_page, 0, tbuf, -1, pszResult, len+1, 0, NULL );
    1152                 :     pszResult[len] = 0;
    1153                 : 
    1154                 :     /* Cleanup */
    1155                 :     CPLFree(tbuf);
    1156                 : 
    1157                 :     return pszResult;
    1158                 : }
    1159                 : 
    1160                 : #endif
    1161                 : 
    1162                 : 
    1163                 : 
    1164                 : /*
    1165                 : ** For now we disable the rest which is locale() related.  We may need 
    1166                 : ** parts of it later. 
    1167                 : */
    1168                 : 
    1169                 : #ifdef notdef 
    1170                 : 
    1171                 : #ifdef _WIN32
    1172                 : # include <windows.h>
    1173                 : #endif
    1174                 : 
    1175                 : /*! Return true if the "locale" seems to indicate that UTF-8 encoding
    1176                 :     is used. If true the utf8tomb and utf8frommb don't do anything
    1177                 :     useful.
    1178                 : 
    1179                 :     <i>It is highly recommended that you change your system so this
    1180                 :     does return true.</i> On Windows this is done by setting the
    1181                 :     "codepage" to CP_UTF8.  On Unix this is done by setting $LC_CTYPE
    1182                 :     to a string containing the letters "utf" or "UTF" in it, or by
    1183                 :     deleting all $LC* and $LANG environment variables. In the future
    1184                 :     it is likely that all non-Asian Unix systems will return true,
    1185                 :     due to the compatibility of UTF-8 with ISO-8859-1.
    1186                 : */
    1187                 : int utf8locale(void) {
    1188                 :   static int ret = 2;
    1189                 :   if (ret == 2) {
    1190                 : #ifdef _WIN32
    1191                 :     ret = GetACP() == CP_UTF8;
    1192                 : #else
    1193                 :     char* s;
    1194                 :     ret = 1; // assumme UTF-8 if no locale
    1195                 :     if (((s = getenv("LC_CTYPE")) && *s) ||
    1196                 :   ((s = getenv("LC_ALL"))   && *s) ||
    1197                 :   ((s = getenv("LANG"))     && *s)) {
    1198                 :       ret = (strstr(s,"utf") || strstr(s,"UTF"));
    1199                 :     }
    1200                 : #endif
    1201                 :   }
    1202                 :   return ret;
    1203                 : }
    1204                 : 
    1205                 : /*! Convert the UTF-8 used by FLTK to the locale-specific encoding
    1206                 :     used for filenames (and sometimes used for data in files).
    1207                 :     Unfortunately due to stupid design you will have to do this as
    1208                 :     needed for filenames. This is a bug on both Unix and Windows.
    1209                 : 
    1210                 :     Up to \a dstlen bytes are written to \a dst, including a null
    1211                 :     terminator. The return value is the number of bytes that would be
    1212                 :     written, not counting the null terminator. If greater or equal to
    1213                 :     \a dstlen then if you malloc a new array of size n+1 you will have
    1214                 :     the space needed for the entire string. If \a dstlen is zero then
    1215                 :     nothing is written and this call just measures the storage space
    1216                 :     needed.
    1217                 : 
    1218                 :     If utf8locale() returns true then this does not change the data.
    1219                 :     It is copied and truncated as necessary to
    1220                 :     the destination buffer and \a srclen is always returned.  */
    1221                 : unsigned utf8tomb(const char* src, unsigned srclen,
    1222                 :       char* dst, unsigned dstlen)
    1223                 : {
    1224                 :   if (!utf8locale()) {
    1225                 : #ifdef _WIN32
    1226                 :     wchar_t lbuf[1024];
    1227                 :     wchar_t* buf = lbuf;
    1228                 :     unsigned length = utf8towc(src, srclen, buf, 1024);
    1229                 :     unsigned ret;
    1230                 :     if (length >= 1024) {
    1231                 :       buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
    1232                 :       utf8towc(src, srclen, buf, length+1);
    1233                 :     }
    1234                 :     if (dstlen) {
    1235                 :       // apparently this does not null-terminate, even though msdn
    1236                 :       // documentation claims it does:
    1237                 :       ret =
    1238                 :         WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0, 0);
    1239                 :       dst[ret] = 0;
    1240                 :     }
    1241                 :     // if it overflows or measuring length, get the actual length:
    1242                 :     if (dstlen==0 || ret >= dstlen-1)
    1243                 :       ret =
    1244                 :   WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
    1245                 :     if (buf != lbuf) free((void*)buf);
    1246                 :     return ret;
    1247                 : #else
    1248                 :     wchar_t lbuf[1024];
    1249                 :     wchar_t* buf = lbuf;
    1250                 :     unsigned length = utf8towc(src, srclen, buf, 1024);
    1251                 :     int ret;
    1252                 :     if (length >= 1024) {
    1253                 :       buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
    1254                 :       utf8towc(src, srclen, buf, length+1);
    1255                 :     }
    1256                 :     if (dstlen) {
    1257                 :       ret = wcstombs(dst, buf, dstlen);
    1258                 :       if (ret >= dstlen-1) ret = wcstombs(0,buf,0);
    1259                 :     } else {
    1260                 :       ret = wcstombs(0,buf,0);
    1261                 :     }
    1262                 :     if (buf != lbuf) free((void*)buf);
    1263                 :     if (ret >= 0) return (unsigned)ret;
    1264                 :     // on any errors we return the UTF-8 as raw text...
    1265                 : #endif
    1266                 :   }
    1267                 :   // identity transform:
    1268                 :   if (srclen < dstlen) {
    1269                 :     memcpy(dst, src, srclen);
    1270                 :     dst[srclen] = 0;
    1271                 :   } else {
    1272                 :     memcpy(dst, src, dstlen-1);
    1273                 :     dst[dstlen-1] = 0;
    1274                 :   }
    1275                 :   return srclen;
    1276                 : }
    1277                 : 
    1278                 : /*! Convert a filename from the locale-specific multibyte encoding
    1279                 :     used by Windows to UTF-8 as used by FLTK.
    1280                 : 
    1281                 :     Up to \a dstlen bytes are written to \a dst, including a null
    1282                 :     terminator. The return value is the number of bytes that would be
    1283                 :     written, not counting the null terminator. If greater or equal to
    1284                 :     \a dstlen then if you malloc a new array of size n+1 you will have
    1285                 :     the space needed for the entire string. If \a dstlen is zero then
    1286                 :     nothing is written and this call just measures the storage space
    1287                 :     needed.
    1288                 : 
    1289                 :     On Unix or on Windows when a UTF-8 locale is in effect, this
    1290                 :     does not change the data. It is copied and truncated as necessary to
    1291                 :     the destination buffer and \a srclen is always returned.
    1292                 :     You may also want to check if utf8test() returns non-zero, so that
    1293                 :     the filesystem can store filenames in UTF-8 encoding regardless of
    1294                 :     the locale.
    1295                 : */
    1296                 : unsigned utf8frommb(char* dst, unsigned dstlen,
    1297                 :         const char* src, unsigned srclen)
    1298                 : {
    1299                 :   if (!utf8locale()) {
    1300                 : #ifdef _WIN32
    1301                 :     wchar_t lbuf[1024];
    1302                 :     wchar_t* buf = lbuf;
    1303                 :     unsigned length;
    1304                 :     unsigned ret;
    1305                 :     length =
    1306                 :       MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
    1307                 :     if (length >= 1024) {
    1308                 :       length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
    1309                 :       buf = (wchar_t*)(malloc(length*sizeof(wchar_t)));
    1310                 :       MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
    1311                 :     }
    1312                 :     ret = utf8fromwc(dst, dstlen, buf, length);
    1313                 :     if (buf != lbuf) free((void*)buf);
    1314                 :     return ret;
    1315                 : #else
    1316                 :     wchar_t lbuf[1024];
    1317                 :     wchar_t* buf = lbuf;
    1318                 :     int length;
    1319                 :     unsigned ret;
    1320                 :     length = mbstowcs(buf, src, 1024);
    1321                 :     if (length >= 1024) {
    1322                 :       length = mbstowcs(0, src, 0)+1;
    1323                 :       buf = (wchar_t*)(malloc(length*sizeof(unsigned short)));
    1324                 :       mbstowcs(buf, src, length);
    1325                 :     }
    1326                 :     if (length >= 0) {
    1327                 :       ret = utf8fromwc(dst, dstlen, buf, length);
    1328                 :       if (buf != lbuf) free((void*)buf);
    1329                 :       return ret;
    1330                 :     }
    1331                 :     // errors in conversion return the UTF-8 unchanged
    1332                 : #endif
    1333                 :   }
    1334                 :   // identity transform:
    1335                 :   if (srclen < dstlen) {
    1336                 :     memcpy(dst, src, srclen);
    1337                 :     dst[srclen] = 0;
    1338                 :   } else {
    1339                 :     memcpy(dst, src, dstlen-1);
    1340                 :     dst[dstlen-1] = 0;
    1341                 :   }
    1342                 :   return srclen;
    1343                 : }
    1344                 : 
    1345                 : #endif /* def notdef - disabled locale specific stuff */
    1346                 : 
    1347                 : /*! Examines the first \a srclen bytes in \a src and return a verdict
    1348                 :     on whether it is UTF-8 or not.
    1349                 :     - Returns 0 if there are any illegal UTF-8 sequences, using the
    1350                 :       same rules as utf8decode(). Note that some UCS values considered
    1351                 :       illegal by RFC 3629, such as 0xffff, are considered legal by this.
    1352                 :     - Returns 1 if there are only single-byte characters (ie no bytes
    1353                 :       have the high bit set). This is legal UTF-8, but also indicates
    1354                 :       plain ASCII. It also returns 1 if \a srclen is zero.
    1355                 :     - Returns 2 if there are only characters less than 0x800.
    1356                 :     - Returns 3 if there are only characters less than 0x10000.
    1357                 :     - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
    1358                 : 
    1359                 :     Because there are many illegal sequences in UTF-8, it is almost
    1360                 :     impossible for a string in another encoding to be confused with
    1361                 :     UTF-8. This is very useful for transitioning Unix to UTF-8
    1362                 :     filenames, you can simply test each filename with this to decide
    1363                 :     if it is UTF-8 or in the locale encoding. My hope is that if
    1364                 :     this is done we will be able to cleanly transition to a locale-less
    1365                 :     encoding.
    1366                 : */
    1367                 : 
    1368          476054 : static int utf8test(const char* src, unsigned srclen) {
    1369          476054 :   int ret = 1;
    1370          476054 :   const char* p = src;
    1371          476054 :   const char* e = src+srclen;
    1372         3727574 :   while (p < e) {
    1373         2775467 :     if (*p & 0x80) {
    1374              82 :       int len; utf8decode(p,e,&len);
    1375              82 :       if (len < 2) return 0;
    1376              81 :       if (len > ret) ret = len;
    1377              81 :       p += len;
    1378                 :     } else {
    1379         2775385 :       p++;
    1380                 :     }
    1381                 :   }
    1382          476053 :   return ret;
    1383                 : }
    1384                 : 
    1385                 : #endif /* defined(CPL_RECODE_STUB) */
Generated by: LCOV version 1.7