LCOV - code coverage report
Current view: directory - port - cpl_recode_stub.cpp (source / functions) Found Hit Coverage
Test: gdal_filtered.info Lines: 223 111 49.8 %
Date: 2012-04-28 Functions: 10 10 100.0 %

       1                 : /**********************************************************************
       2                 :  * $Id: cpl_recode_stub.cpp 23024 2011-09-02 19:45:20Z rouault $
       3                 :  *
       4                 :  * Name:     cpl_recode_stub.cpp
       5                 :  * Project:  CPL - Common Portability Library
       6                 :  * Purpose:  Character set recoding and char/wchar_t conversions, stub
       7                 :  *           implementation to be used if iconv() functionality is not
       8                 :  *           available.
       9                 :  * Author:   Frank Warmerdam, warmerdam@pobox.com
      10                 :  *
      11                 :  * The bulk of this code is derived from the utf.c module from FLTK. It
      12                 :  * was originally downloaded from:
      13                 :  *    http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
      14                 :  * 
      15                 :  **********************************************************************
      16                 :  * Copyright (c) 2008, Frank Warmerdam
      17                 :  * Copyright 2006 by Bill Spitzak and others.
      18                 :  *
      19                 :  * Permission to use, copy, modify, and distribute this software for any
      20                 :  * purpose with or without fee is hereby granted, provided that the above
      21                 :  * copyright notice and this permission notice appear in all copies.
      22                 :  *
      23                 :  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
      24                 :  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
      25                 :  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
      26                 :  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      27                 :  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
      28                 :  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
      29                 :  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
      30                 :  **********************************************************************/
      31                 : 
      32                 : #include "cpl_string.h"
      33                 : 
      34                 : CPL_CVSID("$Id: cpl_recode_stub.cpp 23024 2011-09-02 19:45:20Z rouault $");
      35                 : 
      36                 : #ifdef CPL_RECODE_STUB 
      37                 : 
      38                 : static unsigned utf8decode(const char* p, const char* end, int* len);
      39                 : static unsigned utf8towc(const char* src, unsigned srclen,
      40                 :                          wchar_t* dst, unsigned dstlen);
      41                 : static unsigned utf8toa(const char* src, unsigned srclen,
      42                 :                         char* dst, unsigned dstlen);
      43                 : static unsigned utf8fromwc(char* dst, unsigned dstlen,
      44                 :                            const wchar_t* src, unsigned srclen);
      45                 : static unsigned utf8froma(char* dst, unsigned dstlen,
      46                 :                           const char* src, unsigned srclen);
      47                 : static int utf8test(const char* src, unsigned srclen);
      48                 : 
      49                 : #ifdef FUTURE_NEEDS
      50                 : static const char* utf8fwd(const char* p, const char* start, const char* end);
      51                 : static const char* utf8back(const char* p, const char* start, const char*end);
      52                 : static int utf8encode(unsigned ucs, char* buf);
      53                 : static int utf8bytes(unsigned ucs);
      54                 : #endif /* def FUTURE_NEEDS */
      55                 : 
      56                 : /************************************************************************/
      57                 : /* ==================================================================== */
      58                 : /*  Stub Implementation not depending on iconv() or WIN32 API.  */
      59                 : /* ==================================================================== */
      60                 : /************************************************************************/
      61                 : 
      62                 : /************************************************************************/
      63                 : /*                           CPLRecodeStub()                            */
      64                 : /************************************************************************/
      65                 : 
      66                 : /**
      67                 :  * Convert a string from a source encoding to a destination encoding.
      68                 :  *
      69                 :  * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
      70                 :  * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
      71                 :  * <ul>
      72                 :  *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in fact)</li>
      73                 :  *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
      74                 :  *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
      75                 :  * </ul>
      76                 :  *
       77                 :  * If an error occurs, an error may or may not be posted with CPLError().
      78                 :  *
      79                 :  * @param pszSource a NULL terminated string.
      80                 :  * @param pszSrcEncoding the source encoding.
      81                 :  * @param pszDstEncoding the destination encoding.
      82                 :  *
      83                 :  * @return a NULL terminated string which should be freed with CPLFree().
      84                 :  */
      85                 : 
      86           17986 : char *CPLRecodeStub( const char *pszSource, 
      87                 :                      const char *pszSrcEncoding, 
      88                 :                      const char *pszDstEncoding )
      89                 : 
      90                 : {
      91                 : /* -------------------------------------------------------------------- */
      92                 : /*      If the source or destination is current locale(), we change     */
      93                 : /*      it to ISO8859-1 since our stub implementation does not          */
      94                 : /*      attempt to address locales properly.                            */
      95                 : /* -------------------------------------------------------------------- */
      96                 : 
      97           17986 :     if( pszSrcEncoding[0] == '\0' )
      98               0 :         pszSrcEncoding = CPL_ENC_ISO8859_1;
      99                 : 
     100           17986 :     if( pszDstEncoding[0] == '\0' )
     101               0 :         pszDstEncoding = CPL_ENC_ISO8859_1;
     102                 : 
     103                 : /* -------------------------------------------------------------------- */
     104                 : /*      ISO8859 to UTF8                                                 */
     105                 : /* -------------------------------------------------------------------- */
     106           17986 :     if( strcmp(pszSrcEncoding,CPL_ENC_ISO8859_1) == 0 
     107                 :         && strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
     108                 :     {
     109           11312 :         int nCharCount = strlen(pszSource);
     110           11312 :         char *pszResult = (char *) CPLCalloc(1,nCharCount*2+1);
     111                 :         
     112           11312 :         utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
     113                 :         
     114           11312 :         return pszResult;
     115                 :     }
     116                 : 
     117                 : /* -------------------------------------------------------------------- */
     118                 : /*      UTF8 to ISO8859                                                 */
     119                 : /* -------------------------------------------------------------------- */
     120            6674 :     if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0 
     121                 :         && strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0 )
     122                 :     {
     123            6674 :         int nCharCount = strlen(pszSource);
     124            6674 :         char *pszResult = (char *) CPLCalloc(1,nCharCount+1);
     125                 :         
     126            6674 :         utf8toa( pszSource, nCharCount, pszResult, nCharCount+1 );
     127                 :         
     128            6674 :         return pszResult;
     129                 :     }
     130                 : 
     131                 : /* -------------------------------------------------------------------- */
     132                 : /*      Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with    */
     133                 : /*      a one-time warning.                                             */
     134                 : /* -------------------------------------------------------------------- */
     135               0 :     if( strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
     136                 :     {
     137               0 :         int nCharCount = strlen(pszSource);
     138               0 :         char *pszResult = (char *) CPLCalloc(1,nCharCount*2+1);
     139                 :         static int bHaveWarned = FALSE;
     140                 : 
     141               0 :         if( !bHaveWarned )
     142                 :         {
     143               0 :             bHaveWarned = 1;
     144                 :             CPLError( CE_Warning, CPLE_AppDefined, 
     145                 :                       "Recode from %s to UTF-8 not supported, treated as ISO8859-1 to UTF-8.", 
     146               0 :                       pszSrcEncoding );
     147                 :         }
     148                 : 
     149               0 :         utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
     150                 :         
     151               0 :         return pszResult;
     152                 :     }
     153                 : 
     154                 : /* -------------------------------------------------------------------- */
     155                 : /*      UTF-8 to anything else is treated as UTF-8 to ISO-8859-1        */
     156                 : /*      with a warning.                                                 */
     157                 : /* -------------------------------------------------------------------- */
     158               0 :     if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0 
     159                 :         && strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0 )
     160                 :     {
     161               0 :         int nCharCount = strlen(pszSource);
     162               0 :         char *pszResult = (char *) CPLCalloc(1,nCharCount+1);
     163                 :         static int bHaveWarned = FALSE;
     164                 : 
     165               0 :         if( !bHaveWarned )
     166                 :         {
     167               0 :             bHaveWarned = 1;
     168                 :             CPLError( CE_Warning, CPLE_AppDefined, 
     169                 :                       "Recode from UTF-8 to %s not supported, treated as UTF-8 to ISO8859-1.", 
     170               0 :                       pszDstEncoding );
     171                 :         }
     172                 :         
     173               0 :         utf8toa( pszSource, nCharCount, pszResult, nCharCount+1 );
     174                 :         
     175               0 :         return pszResult;
     176                 :     }
     177                 : 
     178                 : /* -------------------------------------------------------------------- */
     179                 : /*      Everything else is treated as a no-op with a warning.           */
     180                 : /* -------------------------------------------------------------------- */
     181                 :     {
     182                 :         static int bHaveWarned = FALSE;
     183                 : 
     184               0 :         if( !bHaveWarned )
     185                 :         {
     186               0 :             bHaveWarned = 1;
     187                 :             CPLError( CE_Warning, CPLE_AppDefined, 
     188                 :                       "Recode from %s to %s not supported, no change applied.", 
     189               0 :                       pszSrcEncoding, pszDstEncoding );
     190                 :         }
     191                 :         
     192               0 :         return CPLStrdup(pszSource);
     193                 :     }
     194                 : }
     195                 : 
     196                 : /************************************************************************/
     197                 : /*                       CPLRecodeFromWCharStub()                       */
     198                 : /************************************************************************/
     199                 : 
     200                 : /**
     201                 :  * Convert wchar_t string to UTF-8. 
     202                 :  *
     203                 :  * Convert a wchar_t string into a multibyte utf-8 string.  The only
     204                 :  * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
     205                 :  * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
      206                 :  * and CPL_ENC_ISO8859_1.  In some cases (i.e. using iconv()) other encodings
     207                 :  * may also be supported.
     208                 :  *
     209                 :  * Note that the wchar_t type varies in size on different systems. On
     210                 :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     211                 :  *
      212                 :  * If an error occurs, an error may or may not be posted with CPLError().
     213                 :  *
     214                 :  * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
     215                 :  * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
     216                 :  * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
     217                 :  *
     218                 :  * @return a zero terminated multi-byte string which should be freed with 
     219                 :  * CPLFree(), or NULL if an error occurs. 
     220                 :  */
     221                 : 
     222            2248 : char *CPLRecodeFromWCharStub( const wchar_t *pwszSource, 
     223                 :                               const char *pszSrcEncoding, 
     224                 :                               const char *pszDstEncoding )
     225                 : 
     226                 : {
     227                 : /* -------------------------------------------------------------------- */
     228                 : /*      We try to avoid changes of character set.  We are just          */
     229                 : /*      providing for unicode to unicode.                               */
     230                 : /* -------------------------------------------------------------------- */
     231            2248 :     if( strcmp(pszSrcEncoding,"WCHAR_T") != 0 &&
     232                 :         strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0
     233                 :         && strcmp(pszSrcEncoding,CPL_ENC_UTF16) != 0
     234                 :         && strcmp(pszSrcEncoding,CPL_ENC_UCS2) != 0
     235                 :         && strcmp(pszSrcEncoding,CPL_ENC_UCS4) != 0 )
     236                 :     {
     237                 :         CPLError( CE_Failure, CPLE_AppDefined,
     238                 :                   "Stub recoding implementation does not support\n"
     239                 :                   "CPLRecodeFromWCharStub(...,%s,%s)", 
     240               0 :                   pszSrcEncoding, pszDstEncoding );
     241               0 :         return NULL;
     242                 :     }
     243                 : 
     244                 : /* -------------------------------------------------------------------- */
     245                 : /*      What is the source length.                                      */
     246                 : /* -------------------------------------------------------------------- */
     247            2248 :     int nSrcLen = 0;
     248                 : 
     249           22706 :     while( pwszSource[nSrcLen] != 0 )
     250           18210 :         nSrcLen++;
     251                 : 
     252                 : /* -------------------------------------------------------------------- */
     253                 : /*      Allocate destination buffer plenty big.                         */
     254                 : /* -------------------------------------------------------------------- */
     255                 :     char *pszResult;
     256                 :     int nDstBufSize, nDstLen;
     257                 : 
     258            2248 :     nDstBufSize = nSrcLen * 4 + 1;
     259            2248 :     pszResult = (char *) CPLMalloc(nDstBufSize); // nearly worst case.
     260                 : 
     261            2248 :     if (nSrcLen == 0)
     262                 :     {
     263               0 :         pszResult[0] = '\0';
     264               0 :         return pszResult;
     265                 :     }
     266                 : 
     267                 : /* -------------------------------------------------------------------- */
     268                 : /*      Convert, and confirm we had enough space.                       */
     269                 : /* -------------------------------------------------------------------- */
     270            2248 :     nDstLen = utf8fromwc( pszResult, nDstBufSize, pwszSource, nSrcLen );
     271            2248 :     if( nDstLen >= nDstBufSize - 1 )
     272                 :     {
     273               0 :         CPLAssert( FALSE ); // too small!
     274               0 :         return NULL;
     275                 :     }
     276                 : 
     277                 : /* -------------------------------------------------------------------- */
     278                 : /*      If something other than UTF-8 was requested, recode now.        */
     279                 : /* -------------------------------------------------------------------- */
     280            2248 :     if( strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
     281            2248 :         return pszResult;
     282                 : 
     283                 :     char *pszFinalResult = 
     284               0 :         CPLRecodeStub( pszResult, CPL_ENC_UTF8, pszDstEncoding );
     285                 : 
     286               0 :     CPLFree( pszResult );
     287                 :     
     288               0 :     return pszFinalResult;
     289                 : }
     290                 : 
     291                 : /************************************************************************/
     292                 : /*                        CPLRecodeToWCharStub()                        */
     293                 : /************************************************************************/
     294                 : 
     295                 : /**
     296                 :  * Convert UTF-8 string to a wchar_t string.
     297                 :  *
      298                 :  * Convert an 8-bit, multi-byte per character input string into a wide
      299                 :  * character (wchar_t) string.  The only guaranteed supported source encodings
      300                 :  * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1).  The only
     301                 :  * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
     302                 :  * and destination encodings may be supported depending on the underlying
     303                 :  * implementation. 
     304                 :  *
     305                 :  * Note that the wchar_t type varies in size on different systems. On
     306                 :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     307                 :  *
      308                 :  * If an error occurs, an error may or may not be posted with CPLError().
     309                 :  *
     310                 :  * @param pszSource input multi-byte character string.
     311                 :  * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
     312                 :  * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2. 
     313                 :  *
     314                 :  * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
     315                 :  * NULL on error.
     316                 :  *
     317                 :  * @since GDAL 1.6.0
     318                 :  */
     319                 : 
     320            5140 : wchar_t *CPLRecodeToWCharStub( const char *pszSource,
     321                 :                                const char *pszSrcEncoding, 
     322                 :                                const char *pszDstEncoding )
     323                 : 
     324                 : {
     325            5140 :     char *pszUTF8Source = (char *) pszSource;
     326                 : 
     327            5140 :     if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0 
     328                 :         && strcmp(pszSrcEncoding,CPL_ENC_ASCII) != 0 )
     329                 :     {
     330               0 :         pszUTF8Source = CPLRecodeStub( pszSource, pszSrcEncoding, CPL_ENC_UTF8 );
     331               0 :         if( pszUTF8Source == NULL )
     332               0 :             return NULL;
     333                 :     }
     334                 : 
     335                 : /* -------------------------------------------------------------------- */
     336                 : /*      We try to avoid changes of character set.  We are just          */
     337                 : /*      providing for unicode to unicode.                               */
     338                 : /* -------------------------------------------------------------------- */
     339            5140 :     if( strcmp(pszDstEncoding,"WCHAR_T") != 0
     340                 :         && strcmp(pszDstEncoding,CPL_ENC_UCS2) != 0
     341                 :         && strcmp(pszDstEncoding,CPL_ENC_UCS4) != 0 
     342                 :         && strcmp(pszDstEncoding,CPL_ENC_UTF16) != 0 )
     343                 :     {
     344                 :         CPLError( CE_Failure, CPLE_AppDefined,
     345                 :                   "Stub recoding implementation does not support\n"
     346                 :                   "CPLRecodeToWCharStub(...,%s,%s)", 
     347               0 :                   pszSrcEncoding, pszDstEncoding );
     348               0 :         return NULL;
     349                 :     }
     350                 : 
     351                 : /* -------------------------------------------------------------------- */
     352                 : /*      Do the UTF-8 to UCS-2 recoding.                                 */
     353                 : /* -------------------------------------------------------------------- */
     354            5140 :     int nSrcLen = strlen(pszUTF8Source);
     355            5140 :     wchar_t *pwszResult = (wchar_t *) CPLCalloc(sizeof(wchar_t),nSrcLen+1);
     356                 : 
     357            5140 :     utf8towc( pszUTF8Source, nSrcLen, pwszResult, nSrcLen+1 );
     358                 : 
     359            5140 :     if( pszUTF8Source != pszSource )
     360               0 :         CPLFree( pszUTF8Source );
     361                 : 
     362            5140 :     return pwszResult;
     363                 : }
     364                 : 
     365                 : 
     366                 : /************************************************************************/
      367                 : /*                           CPLIsUTF8Stub()                            */
     368                 : /************************************************************************/
     369                 : 
     370                 : /**
     371                 :  * Test if a string is encoded as UTF-8.
     372                 :  *
     373                 :  * @param pabyData input string to test
     374                 :  * @param nLen length of the input string, or -1 if the function must compute
     375                 :  *             the string length. In which case it must be null terminated.
     376                 :  * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
     377                 :  *
     378                 :  * @since GDAL 1.7.0
     379                 :  */
     380          951570 : int CPLIsUTF8Stub(const char* pabyData, int nLen)
     381                 : {
     382          951570 :     if (nLen < 0)
     383          951570 :         nLen = strlen(pabyData);
     384          951570 :     return utf8test(pabyData, (unsigned)nLen) != 0;
     385                 : }
     386                 : 
     387                 : /************************************************************************/
     388                 : /* ==================================================================== */
     389                 : /*  UTF.C code from FLTK with some modifications.                   */
     390                 : /* ==================================================================== */
     391                 : /************************************************************************/
     392                 : 
      393                 : /* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
     394                 :    they are instead turned into the Unicode REPLACEMENT CHARACTER, of
     395                 :    value 0xfffd.
     396                 :    If this is on utf8decode will correctly map most (perhaps all)
     397                 :    human-readable text that is in ISO-8859-1. This may allow you
     398                 :    to completely ignore character sets in your code because virtually
     399                 :    everything is either ISO-8859-1 or UTF-8.
     400                 : */
     401                 : #define ERRORS_TO_ISO8859_1 1
     402                 : 
     403                 : /* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
     404                 :    Unicode index for Microsoft's CP1252 character set. You should
      405                 :    also set ERRORS_TO_ISO8859_1. With this, a much larger amount of
      406                 :    available text (such as all web pages) is correctly converted
     407                 :    to Unicode.
     408                 : */
     409                 : #define ERRORS_TO_CP1252 1
     410                 : 
     411                 : /* A number of Unicode code points are in fact illegal and should not
      412                 :    be produced by a UTF-8 converter. Turning this on will replace the
     413                 :    bytes in those encodings with errors. If you do this then converting
     414                 :    arbitrary 16-bit data to UTF-8 and then back is not an identity,
     415                 :    which will probably break a lot of software.
     416                 : */
     417                 : #define STRICT_RFC3629 0
     418                 : 
     419                 : #if ERRORS_TO_CP1252
     420                 : // Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
     421                 : // to Unicode:
     422                 : static unsigned short cp1252[32] = {
     423                 :   0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
     424                 :   0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
     425                 :   0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
     426                 :   0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
     427                 : };
     428                 : #endif
     429                 : 
     430                 : /************************************************************************/
     431                 : /*                             utf8decode()                             */
     432                 : /************************************************************************/
     433                 : 
     434                 : /*
     435                 :     Decode a single UTF-8 encoded character starting at \e p. The
     436                 :     resulting Unicode value (in the range 0-0x10ffff) is returned,
      437                 :     and \e len is set to the number of bytes in the UTF-8 encoding
     438                 :     (adding \e len to \e p will point at the next character).
     439                 : 
     440                 :     If \a p points at an illegal UTF-8 encoding, including one that
      441                 :     would go past \e end, or where a code uses more bytes than
     442                 :     necessary, then *(unsigned char*)p is translated as though it is
     443                 :     in the Microsoft CP1252 character set and \e len is set to 1.
     444                 :     Treating errors this way allows this to decode almost any
     445                 :     ISO-8859-1 or CP1252 text that has been mistakenly placed where
     446                 :     UTF-8 is expected, and has proven very useful.
     447                 : 
     448                 :     If you want errors to be converted to error characters (as the
     449                 :     standards recommend), adding a test to see if the length is
     450                 :     unexpectedly 1 will work:
     451                 : 
     452                 : \code
     453                 :     if (*p & 0x80) { // what should be a multibyte encoding
     454                 :       code = utf8decode(p,end,&len);
     455                 :       if (len<2) code = 0xFFFD; // Turn errors into REPLACEMENT CHARACTER
     456                 :     } else { // handle the 1-byte utf8 encoding:
     457                 :       code = *p;
     458                 :       len = 1;
     459                 :     }
     460                 : \endcode
     461                 : 
     462                 :     Direct testing for the 1-byte case (as shown above) will also
     463                 :     speed up the scanning of strings where the majority of characters
     464                 :     are ASCII.
     465                 : */
     466             400 : static unsigned utf8decode(const char* p, const char* end, int* len) // decodes one character at p; bytes consumed go to *len, the code is returned
     467                 : {
     468             400 :   unsigned char c = *(unsigned char*)p; // lead byte, read as unsigned for the range tests below
     469             400 :   if (c < 0x80) { // single-byte (ASCII) case
     470               0 :     *len = 1;
     471               0 :     return c;
     472                 : #if ERRORS_TO_CP1252
     473             400 :   } else if (c < 0xa0) { // 0x80..0x9f: looked up in the cp1252[] table (defined outside this block)
     474               2 :     *len = 1;
     475               2 :     return cp1252[c-0x80];
     476                 : #endif
     477             398 :   } else if (c < 0xc2) { // remaining bytes below 0xc2 are rejected as a lead byte
     478               0 :     goto FAIL;
     479                 :   }
     480             398 :   if (p+1 >= end || (p[1]&0xc0) != 0x80) goto FAIL; // every longer form needs a 10xxxxxx byte at p[1], inside [p,end)
     481             398 :   if (c < 0xe0) { // 2-byte sequence: 5 payload bits, then 6
     482             398 :     *len = 2;
     483                 :     return
     484             398 :       ((p[0] & 0x1f) << 6) +
     485             398 :       ((p[1] & 0x3f));
     486               0 :   } else if (c == 0xe0) {
     487               0 :     if (((unsigned char*)p)[1] < 0xa0) goto FAIL; // would decode to a value below 0x800 (overlong)
     488               0 :     goto UTF8_3;
     489                 : #if STRICT_RFC3629
     490                 :   } else if (c == 0xed) {
     491                 :     // RFC 3629 says surrogate chars are illegal.
     492                 :     if (((unsigned char*)p)[1] >= 0xa0) goto FAIL;
     493                 :     goto UTF8_3;
     494                 :   } else if (c == 0xef) {
     495                 :     // 0xfffe and 0xffff are also illegal characters
     496                 :     if (((unsigned char*)p)[1]==0xbf &&
     497                 :   ((unsigned char*)p)[2]>=0xbe) goto FAIL; // NOTE(review): p[2] is read before the p+2 >= end test at UTF8_3
     498                 :     goto UTF8_3;
     499                 : #endif
     500               0 :   } else if (c < 0xf0) {
     501                 :   UTF8_3:
     502               0 :     if (p+2 >= end || (p[2]&0xc0) != 0x80) goto FAIL; // third byte must exist and be a continuation byte
     503               0 :     *len = 3;
     504                 :     return
     505               0 :       ((p[0] & 0x0f) << 12) +
     506               0 :       ((p[1] & 0x3f) << 6) +
     507               0 :       ((p[2] & 0x3f));
     508               0 :   } else if (c == 0xf0) {
     509               0 :     if (((unsigned char*)p)[1] < 0x90) goto FAIL; // would decode to a value below 0x10000 (overlong)
     510               0 :     goto UTF8_4;
     511               0 :   } else if (c < 0xf4) {
     512                 :   UTF8_4:
     513               0 :     if (p+3 >= end || (p[2]&0xc0) != 0x80 || (p[3]&0xc0) != 0x80) goto FAIL; // bytes 3 and 4 must exist and be continuation bytes
     514               0 :     *len = 4;
     515                 : #if STRICT_RFC3629
     516                 :     // RFC 3629 says all codes ending in fffe or ffff are illegal:
     517                 :     if ((p[1]&0xf)==0xf &&
     518                 :   ((unsigned char*)p)[2] == 0xbf &&
     519                 :   ((unsigned char*)p)[3] >= 0xbe) goto FAIL;
     520                 : #endif
     521                 :     return
     522               0 :       ((p[0] & 0x07) << 18) +
     523               0 :       ((p[1] & 0x3f) << 12) +
     524               0 :       ((p[2] & 0x3f) << 6) +
     525               0 :       ((p[3] & 0x3f));
     526               0 :   } else if (c == 0xf4) {
     527               0 :     if (((unsigned char*)p)[1] > 0x8f) goto FAIL; // after 0x10ffff
     528               0 :     goto UTF8_4;
     529                 :   } else {
     530                 :   FAIL:
     531               0 :     *len = 1; // errors always consume exactly one byte
     532                 : #if ERRORS_TO_ISO8859_1
     533               0 :     return c; // hand the offending byte back unchanged
     534                 : #else
     535                 :     return 0xfffd; // Unicode REPLACEMENT CHARACTER
     536                 : #endif
     537                 :   }
     538                 : }
     539                 : 
     540                 : /************************************************************************/
     541                 : /*                              utf8fwd()                               */
     542                 : /************************************************************************/
     543                 : 
     544                 : /*
     545                 :   Move \a p forward until it points to the start of a UTF-8
     546                 :   character. If it already points at the start of one then it
     547                 :   is returned unchanged. Any UTF-8 errors are treated as though each
     548                 :   byte of the error is an individual character.
     549                 : 
     550                 :   \e start is the start of the string and is used to limit the
     551                 :   backwards search for the start of a utf8 character.
     552                 : 
     553                 :   \e end is the end of the string and is assumed to be a break
     554                 :   between characters. It is assumed to be greater than p.
     555                 : 
     556                 :   This function is for moving a pointer that was jumped to the
     557                 :   middle of a string, such as when doing a binary search for
     558                 :   a position. You should use either this or utf8back() depending
     559                 :   on which direction your algorithm can handle the pointer
     560                 :   moving. Do not use this to scan strings, use utf8decode()
     561                 :   instead.
     562                 : */
     563                 : 
     564                 : #ifdef FUTURE_NEEDS
     565                 : static const char* utf8fwd(const char* p, const char* start, const char* end)
     566                 : {
     567                 :   const char* a;
     568                 :   int len;
     569                 :   // if we are not pointing at a continuation character, we are done:
     570                 :   if ((*p&0xc0) != 0x80) return p;
     571                 :   // search backwards for a byte with the top two bits set, which would start the character:
     572                 :   for (a = p-1; ; --a) {
     573                 :     if (a < start) return p; // ran past the start of the string without finding one
     574                 :     if (!(a[0]&0x80)) return p; // reached a byte with the high bit clear: give up, leave p alone
     575                 :     if ((a[0]&0x40)) break; // high bit and 0x40 both set: candidate lead byte
     576                 :   }
     577                 :   utf8decode(a,end,&len); // only the reported length is used; the decoded value is discarded
     578                 :   a += len;
     579                 :   if (a > p) return a; // the sequence found at a extends past p: return the byte after it
     580                 :   return p; // otherwise p is returned as it was
     581                 : }
     582                 : #endif /* def FUTURE_NEEDS */
     583                 : 
     584                 : /************************************************************************/
     585                 : /*                              utf8back()                              */
     586                 : /************************************************************************/
     587                 : 
     588                 : /*
     589                 :   Move \a p backward until it points to the start of a UTF-8
     590                 :   character. If it already points at the start of one then it
     591                 :   is returned unchanged. Any UTF-8 errors are treated as though each
     592                 :   byte of the error is an individual character.
     593                 : 
     594                 :   \e start is the start of the string and is used to limit the
     595                 :   backwards search for the start of a UTF-8 character.
     596                 : 
     597                 :   \e end is the end of the string and is assumed to be a break
     598                 :   between characters. It is assumed to be greater than p.
     599                 : 
     600                 :   If you wish to decrement a UTF-8 pointer, pass p-1 to this.
     601                 : */
     602                 : 
     603                 : #ifdef FUTURE_NEEDS
     604                 : static const char* utf8back(const char* p, const char* start, const char* end)
     605                 : {
     606                 :   const char* a;
     607                 :   int len;
     608                 :   // if we are not pointing at a continuation character, we are done:
     609                 :   if ((*p&0xc0) != 0x80) return p;
     610                 :   // search backwards for a byte with the top two bits set, which would start the character:
     611                 :   for (a = p-1; ; --a) {
     612                 :     if (a < start) return p; // ran past the start of the string without finding one
     613                 :     if (!(a[0]&0x80)) return p; // reached a byte with the high bit clear: give up, leave p alone
     614                 :     if ((a[0]&0x40)) break; // high bit and 0x40 both set: candidate lead byte
     615                 :   }
     616                 :   utf8decode(a,end,&len); // only the reported length is used; the decoded value is discarded
     617                 :   if (a+len > p) return a; // the sequence found at a covers p: its first byte is the answer
     618                 :   return p; // otherwise p is returned as it was
     619                 : }
     620                 : #endif /* def FUTURE_NEEDS */
     621                 : 
     622                 : /************************************************************************/
     623                 : /*                             utf8bytes()                              */
     624                 : /************************************************************************/
     625                 : 
     626                 : /* Returns number of bytes that utf8encode() will use to encode the
     627                 :   character \a ucs. */
     628                 : #ifdef FUTURE_NEEDS
     629                 : static int utf8bytes(unsigned ucs) {
     630                 :   if (ucs < 0x000080U) { // below 0x80: one byte
     631                 :     return 1;
     632                 :   } else if (ucs < 0x000800U) { // below 0x800: two bytes
     633                 :     return 2;
     634                 :   } else if (ucs < 0x010000U) { // below 0x10000: three bytes
     635                 :     return 3;
     636                 :   } else if (ucs < 0x10ffffU) { // NOTE(review): strict '<' sends 0x10ffff itself to the branch below, not here
     637                 :     return 4;
     638                 :   } else {
     639                 :     return 3; // length of the illegal character encoding
     640                 :   }
     641                 : }
     642                 : #endif /* def FUTURE_NEEDS */
     643                 : 
     644                 : /************************************************************************/
     645                 : /*                             utf8encode()                             */
     646                 : /************************************************************************/
     647                 : 
     648                 : /* Write the UTF-8 encoding of \e ucs into \e buf and return the
     649                 :     number of bytes written. Up to 4 bytes may be written. If you know
     650                 :     that \a ucs is less than 0x10000 then at most 3 bytes will be written.
     651                 :     If you wish to speed this up, remember that anything less than 0x80
     652                 :     is written as a single byte.
     653                 : 
     654                 :     If ucs is greater than 0x10ffff this is an illegal character
     655                 :     according to RFC 3629. These are converted as though they are
     656                 :     0xFFFD (REPLACEMENT CHARACTER).
     657                 : 
     658                 :     RFC 3629 also says many other values for \a ucs are illegal (in
     659                 :     the range 0xd800 to 0xdfff, or ending with 0xfffe or
     660                 :     0xffff). However I encode these as though they are legal, so that
     661                 :     utf8encode/utf8decode will be the identity for all codes between 0
     662                 :     and 0x10ffff.
     663                 : */
     664                 : #ifdef FUTURE_NEEDS
     665                 : static int utf8encode(unsigned ucs, char* buf) { // writes 1 to 4 bytes into buf (no terminator) and returns how many
     666                 :   if (ucs < 0x000080U) {
     667                 :     buf[0] = ucs; // stored as a single byte
     668                 :     return 1;
     669                 :   } else if (ucs < 0x000800U) {
     670                 :     buf[0] = 0xc0 | (ucs >> 6); // lead byte carries the bits above the low 6
     671                 :     buf[1] = 0x80 | (ucs & 0x3F); // continuation byte carries the low 6 bits
     672                 :     return 2;
     673                 :   } else if (ucs < 0x010000U) {
     674                 :     buf[0] = 0xe0 | (ucs >> 12);
     675                 :     buf[1] = 0x80 | ((ucs >> 6) & 0x3F);
     676                 :     buf[2] = 0x80 | (ucs & 0x3F);
     677                 :     return 3;
     678                 :   } else if (ucs < 0x0010ffffU) { // NOTE(review): strict '<' means 0x10ffff itself is written as 0xfffd below
     679                 :     buf[0] = 0xf0 | (ucs >> 18);
     680                 :     buf[1] = 0x80 | ((ucs >> 12) & 0x3F);
     681                 :     buf[2] = 0x80 | ((ucs >> 6) & 0x3F);
     682                 :     buf[3] = 0x80 | (ucs & 0x3F);
     683                 :     return 4;
     684                 :   } else {
     685                 :     // encode 0xfffd:
     686                 :     buf[0] = 0xefU;
     687                 :     buf[1] = 0xbfU;
     688                 :     buf[2] = 0xbdU;
     689                 :     return 3;
     690                 :   }
     691                 : }
     692                 : #endif /* def FUTURE_NEEDS */
     693                 : 
     694                 : /************************************************************************/
     695                 : /*                              utf8towc()                              */
     696                 : /************************************************************************/
     697                 : 
     698                 : /*  Convert a UTF-8 sequence into an array of wchar_t. These
     699                 :     are used by some system calls, especially on Windows.
     700                 : 
     701                 :     \a src points at the UTF-8, and \a srclen is the number of bytes to
     702                 :     convert.
     703                 : 
     704                 :     \a dst points at an array to write, and \a dstlen is the number of
     705                 :     locations in this array. At most \a dstlen-1 words will be
     706                 :     written there, plus a 0 terminating word. Thus this function
     707                 :     will never overwrite the buffer and will always return a
     708                 :     zero-terminated string. If \a dstlen is zero then \a dst can be
     709                 :     null and no data is written, but the length is returned.
     710                 : 
     711                 :     The return value is the number of words that \e would be written
     712                 :     to \a dst if it were long enough, not counting the terminating
     713                 :     zero. If the return value is greater or equal to \a dstlen it
     714                 :     indicates truncation, you can then allocate a new array of size
     715                 :     return+1 and call this again.
     716                 : 
     717                 :     Errors in the UTF-8 are converted as though each byte in the
     718                 :     erroneous string is in the Microsoft CP1252 encoding. This allows
     719                 :     ISO-8859-1 text mistakenly identified as UTF-8 to be printed
     720                 :     correctly.
     721                 : 
     722                 :     Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
     723                 :     and most other systems. Where wchar_t is 16 bits, Unicode
     724                 :     characters in the range 0x10000 to 0x10ffff are converted to
     725                 :     "surrogate pairs" which take two words each (this is called UTF-16
     726                 :     encoding). If wchar_t is 32 bits this rather nasty problem is
     727                 :     avoided.
     728                 : */
     729            5140 : static unsigned utf8towc(const char* src, unsigned srclen,
     730                 :                          wchar_t* dst, unsigned dstlen)
     731                 : {
     732            5140 :   const char* p = src; // read cursor
     733            5140 :   const char* e = src+srclen; // one past the last input byte
     734            5140 :   unsigned count = 0; // words written so far, later words that would have been written
     735           24664 :   if (dstlen) for (;;) { // with dstlen == 0 only the measuring loop below runs
     736           24664 :     if (p >= e) {dst[count] = 0; return count;} // input exhausted: terminate and return the length
     737           19524 :     if (!(*p & 0x80)) { // ascii
     738           19268 :       dst[count] = *p++;
     739                 :     } else {
     740             256 :       int len; unsigned ucs = utf8decode(p,e,&len);
     741             256 :       p += len; // advance by the number of bytes utf8decode reported
     742                 : #ifdef _WIN32
     743                 :       if (ucs < 0x10000) {
     744                 :           dst[count] = (wchar_t)ucs;
     745                 :       } else {
     746                 :   // make a surrogate pair:
     747                 :   if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;} // pair plus terminator does not fit: terminate, count the pair, go measure
     748                 :         dst[count] = (wchar_t)((((ucs-0x10000u)>>10)&0x3ff) | 0xd800); // first word: upper 10 bits of (ucs-0x10000)
     749                 :         dst[++count] = (wchar_t)((ucs&0x3ff) | 0xdc00); // second word: lower 10 bits
     750                 :       }
     751                 : #else
     752             256 :       dst[count] = (wchar_t)ucs;
     753                 : #endif
     754                 :     }
     755           19524 :     if (++count == dstlen) {dst[count-1] = 0; break;} // buffer full: the last slot is overwritten with the terminator
     756                 :   }
     757                 :   // we filled dst, measure the rest:
     758               0 :   while (p < e) {
     759               0 :     if (!(*p & 0x80)) p++;
     760                 :     else {
     761                 : #ifdef _WIN32
     762                 :       int len; unsigned ucs = utf8decode(p,e,&len);
     763                 :       p += len;
     764                 :       if (ucs >= 0x10000) ++count; // would need a second word for the surrogate pair
     765                 : #else
     766               0 :       int len; utf8decode(p,e,&len); // only the length matters while measuring
     767               0 :       p += len;
     768                 : #endif
     769                 :     }
     770               0 :     ++count;
     771                 :   }
     772               0 :   return count; // >= dstlen here, which tells the caller the output was truncated
     773                 : }
     774                 : 
     775                 : /************************************************************************/
     776                 : /*                              utf8toa()                               */
     777                 : /************************************************************************/
     778                 : /* Convert a UTF-8 sequence into an array of 1-byte characters.
     779                 : 
     780                 :     If the UTF-8 decodes to a character greater than 0xff then it is
     781                 :     replaced with '?'.
     782                 : 
     783                 :     Errors in the UTF-8 are converted as individual bytes, same as
     784                 :     utf8decode() does. This allows ISO-8859-1 text mistakenly identified
     785                 :     as UTF-8 to be printed correctly (and possibly CP1252 on Windows).
     786                 : 
     787                 :     \a src points at the UTF-8, and \a srclen is the number of bytes to
     788                 :     convert.
     789                 : 
     790                 :     Up to \a dstlen bytes are written to \a dst, including a null
     791                 :     terminator. The return value is the number of bytes that would be
     792                 :     written, not counting the null terminator. If greater or equal to
     793                 :     \a dstlen then if you malloc a new array of size n+1 you will have
     794                 :     the space needed for the entire string. If \a dstlen is zero then
     795                 :     nothing is written and this call just measures the storage space
     796                 :     needed.
     797                 : */
     798            6674 : static unsigned utf8toa(const char* src, unsigned srclen,
     799                 :                         char* dst, unsigned dstlen)
     800                 : {
     801            6674 :   const char* p = src; // read cursor
     802            6674 :   const char* e = src+srclen; // one past the last input byte
     803            6674 :   unsigned count = 0; // bytes written so far, later bytes that would have been written
     804           49204 :   if (dstlen) for (;;) { // with dstlen == 0 only the measuring loop below runs
     805                 :     unsigned char c;
     806           49204 :     if (p >= e) {dst[count] = 0; return count;} // input exhausted: terminate and return the length
     807           42530 :     c = *(unsigned char*)p;
     808           42530 :     if (c < 0xC2) { // ascii or bad code: the byte is copied through unchanged
     809           42530 :       dst[count] = c;
     810           42530 :       p++;
     811                 :     } else {
     812               0 :       int len; unsigned ucs = utf8decode(p,e,&len);
     813               0 :       p += len; // advance by the number of bytes utf8decode reported
     814               0 :       if (ucs < 0x100) dst[count] = (char)ucs; // fits in one output byte
     815                 :       else
     816                 :       {
     817                 :           static int bHasWarned = FALSE; // function-local static: the warning below is issued only the first time
     818               0 :           if (!bHasWarned)
     819                 :           {
     820               0 :               bHasWarned = TRUE;
     821                 :               CPLError(CE_Warning, CPLE_AppDefined,
     822                 :                        "One or several characters couldn't be converted correctly from UTF-8 to ISO-8859-1.\n"
     823               0 :                        "This warning will not be emitted anymore");
     824                 :           }
     825               0 :           dst[count] = '?'; // placeholder for anything above 0xff
     826                 :       }
     827                 :     }
     828           42530 :     if (++count >= dstlen) {dst[count-1] = 0; break;} // buffer full: the last slot is overwritten with the terminator
     829                 :   }
     830                 :   // we filled dst, measure the rest:
     831               0 :   while (p < e) {
     832               0 :     if (!(*p & 0x80)) p++;
     833                 :     else {
     834                 :       int len;
     835               0 :       utf8decode(p,e,&len); // only the length matters while measuring
     836               0 :       p += len;
     837                 :     }
     838               0 :     ++count; // every decoded character is one output byte
     839                 :   }
     840               0 :   return count;
     841                 : }
     842                 : 
     843                 : /************************************************************************/
     844                 : /*                             utf8fromwc()                             */
     845                 : /************************************************************************/
     846                 : /* Turn "wide characters" as returned by some system calls
     847                 :     (especially on Windows) into UTF-8.
     848                 : 
     849                 :     Up to \a dstlen bytes are written to \a dst, including a null
     850                 :     terminator. The return value is the number of bytes that would be
     851                 :     written, not counting the null terminator. If greater or equal to
     852                 :     \a dstlen then if you malloc a new array of size n+1 you will have
     853                 :     the space needed for the entire string. If \a dstlen is zero then
     854                 :     nothing is written and this call just measures the storage space
     855                 :     needed.
     856                 : 
     857                 :     \a srclen is the number of words in \a src to convert. On Windows
     858                 :     this is not necessarily the number of characters, due to there
     859                 :     possibly being "surrogate pairs" in the UTF-16 encoding used.
     860                 :     On Unix wchar_t is 32 bits and each location is a character.
     861                 : 
     862                 :     On Unix if a src word is greater than 0x10ffff then this is an
     863                 :     illegal character according to RFC 3629. These are converted as
     864                 :     though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
     865                 :     range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
     866                 :     illegal according to RFC 3629. However I encode these as though
     867                 :     they are legal, so that utf8towc will return the original data.
     868                 : 
     869                 :     On Windows "surrogate pairs" are converted to a single character
     870                 :     and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
     871                 :     pairs are converted as though they are individual characters.
     872                 : */
     873            2248 : static unsigned utf8fromwc(char* dst, unsigned dstlen,
     874                 :                            const wchar_t* src, unsigned srclen) {
     875            2248 :   unsigned i = 0; // index of the next source word
     876            2248 :   unsigned count = 0; // bytes written so far, later bytes that would have been written
     877           20458 :   if (dstlen) for (;;) { // with dstlen == 0 only the measuring loop below runs
     878                 :     unsigned ucs;
     879           20458 :     if (i >= srclen) {dst[count] = 0; return count;} // input exhausted: terminate and return the length
     880           18210 :     ucs = src[i++];
     881           18210 :     if (ucs < 0x80U) {
     882           16470 :       dst[count++] = (char)ucs;
     883           16470 :       if (count >= dstlen) {dst[count-1] = 0; break;} // buffer full: the last slot is overwritten with the terminator
     884            1740 :     } else if (ucs < 0x800U) { // 2 bytes
     885            1740 :       if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;} // 2 bytes plus terminator do not fit: terminate, count them, go measure
     886            1740 :       dst[count++] = 0xc0 | (char)(ucs >> 6);
     887            1740 :       dst[count++] = 0x80 | (char)(ucs & 0x3F);
     888                 : #ifdef _WIN32
     889                 :     } else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen && // first half of a pair, and another word is available
     890                 :          src[i] >= 0xdc00 && src[i] <= 0xdfff) {
     891                 :       // surrogate pair
     892                 :       unsigned ucs2 = src[i++];
     893                 :       ucs = 0x10000U + ((ucs&0x3ff)<<10) + (ucs2&0x3ff);
     894                 :       // all surrogate pairs turn into 4-byte utf8
     895                 : #else
     896               0 :     } else if (ucs >= 0x10000) {
     897               0 :       if (ucs > 0x10ffff) { // too large: written as 0xfffd through the 3-byte path at J1
     898               0 :   ucs = 0xfffd;
     899               0 :   goto J1;
     900                 :       }
     901                 : #endif
     902               0 :       if (count+4 >= dstlen) {dst[count] = 0; count += 4; break;} // 4 bytes plus terminator do not fit
     903               0 :       dst[count++] = 0xf0 | (char)(ucs >> 18);
     904               0 :       dst[count++] = 0x80 | (char)((ucs >> 12) & 0x3F);
     905               0 :       dst[count++] = 0x80 | (char)((ucs >> 6) & 0x3F);
     906               0 :       dst[count++] = 0x80 | (char)(ucs & 0x3F);
     907                 :     } else {
     908                 : #ifndef _WIN32
     909                 :     J1:
     910                 : #endif
     911                 :       // all others are 3 bytes:
     912               0 :       if (count+3 >= dstlen) {dst[count] = 0; count += 3; break;} // 3 bytes plus terminator do not fit
     913               0 :       dst[count++] = 0xe0 | (char)(ucs >> 12);
     914               0 :       dst[count++] = 0x80 | (char)((ucs >> 6) & 0x3F);
     915               0 :       dst[count++] = 0x80 | (char)(ucs & 0x3F);
     916                 :     }
     917                 :   }
     918                 :   // we filled dst, measure the rest:
     919               0 :   while (i < srclen) {
     920               0 :     unsigned ucs = src[i++];
     921               0 :     if (ucs < 0x80U) {
     922               0 :       count++;
     923               0 :     } else if (ucs < 0x800U) { // 2 bytes
     924               0 :       count += 2;
     925                 : #ifdef _WIN32
     926                 :     } else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen-1 && // NOTE(review): i was already advanced; the copy loop above tests src[i], this tests src[i+1]
     927                 :          src[i+1] >= 0xdc00 && src[i+1] <= 0xdfff) {
     928                 :       // surrogate pair
     929                 :       ++i;
     930                 : #else
     931               0 :     } else if (ucs >= 0x10000 && ucs <= 0x10ffff) {
     932                 : #endif
     933               0 :       count += 4;
     934                 :     } else {
     935               0 :       count += 3; // everything else, including out-of-range values, is measured as 3 bytes
     936                 :     }
     937                 :   }
     938               0 :   return count;
     939                 : }
     940                 : 
     941                 : 
     942                 : /************************************************************************/
     943                 : /*                             utf8froma()                              */
     944                 : /************************************************************************/
     945                 : 
     946                 : /* Convert an ISO-8859-1 (ie normal c-string) byte stream to UTF-8.
     947                 : 
     948                 :     It is possible this should convert Microsoft's CP1252 to UTF-8
     949                 :     instead. This would translate the codes in the range 0x80-0x9f
     950                 :     to different characters. Currently it does not do this.
     951                 : 
     952                 :     Up to \a dstlen bytes are written to \a dst, including a null
     953                 :     terminator. The return value is the number of bytes that would be
     954                 :     written, not counting the null terminator. If greater or equal to
     955                 :     \a dstlen then if you malloc a new array of size n+1 you will have
     956                 :     the space needed for the entire string. If \a dstlen is zero then
     957                 :     nothing is written and this call just measures the storage space
     958                 :     needed.
     959                 : 
     960                 :     \a srclen is the number of bytes in \a src to convert.
     961                 : 
     962                 :     If the return value equals \a srclen then this indicates that
     963                 :     no conversion is necessary, as only ASCII characters are in the
     964                 :     string.
     965                 : */
     966           11312 : static unsigned utf8froma(char* dst, unsigned dstlen,
     967                 :                           const char* src, unsigned srclen) {
     968           11312 :   const char* p = src; // read cursor
     969           11312 :   const char* e = src+srclen; // one past the last input byte
     970           11312 :   unsigned count = 0; // bytes written so far, later bytes that would have been written
     971           86376 :   if (dstlen) for (;;) { // with dstlen == 0 only the measuring loop below runs
     972                 :     unsigned char ucs;
     973           86376 :     if (p >= e) {dst[count] = 0; return count;} // input exhausted: terminate and return the length
     974           75064 :     ucs = *(unsigned char*)p++;
     975           75064 :     if (ucs < 0x80U) {
     976           75032 :       dst[count++] = ucs; // below 0x80: copied as is
     977           75032 :       if (count >= dstlen) {dst[count-1] = 0; break;} // buffer full: the last slot is overwritten with the terminator
     978                 :     } else { // 2 bytes (note that CP1252 translate could make 3 bytes!)
     979              32 :       if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;} // 2 bytes plus terminator do not fit: terminate, count them, go measure
     980              32 :       dst[count++] = 0xc0 | (ucs >> 6); // lead byte carries the top 2 bits of the input byte
     981              32 :       dst[count++] = 0x80 | (ucs & 0x3F); // continuation byte carries the low 6 bits
     982                 :     }
     983                 :   }
     984                 :   // we filled dst, measure the rest:
     985               0 :   while (p < e) {
     986               0 :     unsigned char ucs = *(unsigned char*)p++;
     987               0 :     if (ucs < 0x80U) {
     988               0 :       count++;
     989                 :     } else {
     990               0 :       count += 2;
     991                 :     }
     992                 :   }
     993               0 :   return count;
     994                 : }
     995                 : 
     996                 : /*
     997                 : ** For now we disable the rest which is locale() related.  We may need 
     998                 : ** parts of it later. 
     999                 : */
    1000                 : 
    1001                 : #ifdef notdef 
    1002                 : 
    1003                 : #ifdef _WIN32
    1004                 : # include <windows.h>
    1005                 : #endif
    1006                 : 
    1007                 : /*! Return true if the "locale" seems to indicate that UTF-8 encoding
    1008                 :     is used. If true the utf8tomb and utf8frommb don't do anything
    1009                 :     useful.
    1010                 : 
    1011                 :     <i>It is highly recommended that you change your system so this
    1012                 :     does return true.</i> On Windows this is done by setting the
    1013                 :     "codepage" to CP_UTF8.  On Unix this is done by setting $LC_CTYPE
    1014                 :     to a string containing the letters "utf" or "UTF" in it, or by
    1015                 :     deleting all $LC* and $LANG environment variables. In the future
    1016                 :     it is likely that all non-Asian Unix systems will return true,
    1017                 :     due to the compatability of UTF-8 with ISO-8859-1.
    1018                 : */
    1019                 : int utf8locale(void) {
    1020                 :   static int ret = 2;
    1021                 :   if (ret == 2) {
    1022                 : #ifdef _WIN32
    1023                 :     ret = GetACP() == CP_UTF8;
    1024                 : #else
    1025                 :     char* s;
    1026                 :     ret = 1; // assumme UTF-8 if no locale
    1027                 :     if (((s = getenv("LC_CTYPE")) && *s) ||
    1028                 :   ((s = getenv("LC_ALL"))   && *s) ||
    1029                 :   ((s = getenv("LANG"))     && *s)) {
    1030                 :       ret = (strstr(s,"utf") || strstr(s,"UTF"));
    1031                 :     }
    1032                 : #endif
    1033                 :   }
    1034                 :   return ret;
    1035                 : }
    1036                 : 
    1037                 : /*! Convert the UTF-8 used by FLTK to the locale-specific encoding
    1038                 :     used for filenames (and sometimes used for data in files).
    1039                 :     Unfortunatley due to stupid design you will have to do this as
    1040                 :     needed for filenames. This is a bug on both Unix and Windows.
    1041                 : 
    1042                 :     Up to \a dstlen bytes are written to \a dst, including a null
    1043                 :     terminator. The return value is the number of bytes that would be
    1044                 :     written, not counting the null terminator. If greater or equal to
    1045                 :     \a dstlen then if you malloc a new array of size n+1 you will have
    1046                 :     the space needed for the entire string. If \a dstlen is zero then
    1047                 :     nothing is written and this call just measures the storage space
    1048                 :     needed.
    1049                 : 
    1050                 :     If utf8locale() returns true then this does not change the data.
    1051                 :     It is copied and truncated as necessary to
    1052                 :     the destination buffer and \a srclen is always returned.  */
    1053                 : unsigned utf8tomb(const char* src, unsigned srclen,
    1054                 :       char* dst, unsigned dstlen)
    1055                 : {
    1056                 :   if (!utf8locale()) {
    1057                 : #ifdef _WIN32
    1058                 :     wchar_t lbuf[1024];
    1059                 :     wchar_t* buf = lbuf;
    1060                 :     unsigned length = utf8towc(src, srclen, buf, 1024);
    1061                 :     unsigned ret;
    1062                 :     if (length >= 1024) {
    1063                 :       buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
    1064                 :       utf8towc(src, srclen, buf, length+1);
    1065                 :     }
    1066                 :     if (dstlen) {
    1067                 :       // apparently this does not null-terminate, even though msdn
    1068                 :       // documentation claims it does:
    1069                 :       ret =
    1070                 :         WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0, 0);
    1071                 :       dst[ret] = 0;
    1072                 :     }
    1073                 :     // if it overflows or measuring length, get the actual length:
    1074                 :     if (dstlen==0 || ret >= dstlen-1)
    1075                 :       ret =
    1076                 :   WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
    1077                 :     if (buf != lbuf) free((void*)buf);
    1078                 :     return ret;
    1079                 : #else
    1080                 :     wchar_t lbuf[1024];
    1081                 :     wchar_t* buf = lbuf;
    1082                 :     unsigned length = utf8towc(src, srclen, buf, 1024);
    1083                 :     int ret;
    1084                 :     if (length >= 1024) {
    1085                 :       buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
    1086                 :       utf8towc(src, srclen, buf, length+1);
    1087                 :     }
    1088                 :     if (dstlen) {
    1089                 :       ret = wcstombs(dst, buf, dstlen);
    1090                 :       if (ret >= dstlen-1) ret = wcstombs(0,buf,0);
    1091                 :     } else {
    1092                 :       ret = wcstombs(0,buf,0);
    1093                 :     }
    1094                 :     if (buf != lbuf) free((void*)buf);
    1095                 :     if (ret >= 0) return (unsigned)ret;
    1096                 :     // on any errors we return the UTF-8 as raw text...
    1097                 : #endif
    1098                 :   }
    1099                 :   // identity transform:
    1100                 :   if (srclen < dstlen) {
    1101                 :     memcpy(dst, src, srclen);
    1102                 :     dst[srclen] = 0;
    1103                 :   } else {
    1104                 :     memcpy(dst, src, dstlen-1);
    1105                 :     dst[dstlen-1] = 0;
    1106                 :   }
    1107                 :   return srclen;
    1108                 : }
    1109                 : 
    1110                 : /*! Convert a filename from the locale-specific multibyte encoding
    1111                 :     used by Windows to UTF-8 as used by FLTK.
    1112                 : 
    1113                 :     Up to \a dstlen bytes are written to \a dst, including a null
    1114                 :     terminator. The return value is the number of bytes that would be
    1115                 :     written, not counting the null terminator. If greater or equal to
    1116                 :     \a dstlen then if you malloc a new array of size n+1 you will have
    1117                 :     the space needed for the entire string. If \a dstlen is zero then
    1118                 :     nothing is written and this call just measures the storage space
    1119                 :     needed.
    1120                 : 
    1121                 :     On Unix or on Windows when a UTF-8 locale is in effect, this
    1122                 :     does not change the data. It is copied and truncated as necessary to
    1123                 :     the destination buffer and \a srclen is always returned.
    1124                 :     You may also want to check if utf8test() returns non-zero, so that
    1125                 :     the filesystem can store filenames in UTF-8 encoding regardless of
    1126                 :     the locale.
    1127                 : */
    1128                 : unsigned utf8frommb(char* dst, unsigned dstlen,
    1129                 :         const char* src, unsigned srclen)
    1130                 : {
    1131                 :   if (!utf8locale()) {
    1132                 : #ifdef _WIN32
    1133                 :     wchar_t lbuf[1024];
    1134                 :     wchar_t* buf = lbuf;
    1135                 :     unsigned length;
    1136                 :     unsigned ret;
    1137                 :     length =
    1138                 :       MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
    1139                 :     if (length >= 1024) {
    1140                 :       length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
    1141                 :       buf = (wchar_t*)(malloc(length*sizeof(wchar_t)));
    1142                 :       MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
    1143                 :     }
    1144                 :     ret = utf8fromwc(dst, dstlen, buf, length);
    1145                 :     if (buf != lbuf) free((void*)buf);
    1146                 :     return ret;
    1147                 : #else
    1148                 :     wchar_t lbuf[1024];
    1149                 :     wchar_t* buf = lbuf;
    1150                 :     int length;
    1151                 :     unsigned ret;
    1152                 :     length = mbstowcs(buf, src, 1024);
    1153                 :     if (length >= 1024) {
    1154                 :       length = mbstowcs(0, src, 0)+1;
    1155                 :       buf = (wchar_t*)(malloc(length*sizeof(unsigned short)));
    1156                 :       mbstowcs(buf, src, length);
    1157                 :     }
    1158                 :     if (length >= 0) {
    1159                 :       ret = utf8fromwc(dst, dstlen, buf, length);
    1160                 :       if (buf != lbuf) free((void*)buf);
    1161                 :       return ret;
    1162                 :     }
    1163                 :     // errors in conversion return the UTF-8 unchanged
    1164                 : #endif
    1165                 :   }
    1166                 :   // identity transform:
    1167                 :   if (srclen < dstlen) {
    1168                 :     memcpy(dst, src, srclen);
    1169                 :     dst[srclen] = 0;
    1170                 :   } else {
    1171                 :     memcpy(dst, src, dstlen-1);
    1172                 :     dst[dstlen-1] = 0;
    1173                 :   }
    1174                 :   return srclen;
    1175                 : }
    1176                 : 
    1177                 : #endif /* def notdef - disabled locale specific stuff */
    1178                 : 
    1179                 : /*! Examines the first \a srclen bytes in \a src and return a verdict
    1180                 :     on whether it is UTF-8 or not.
    1181                 :     - Returns 0 if there is any illegal UTF-8 sequences, using the
    1182                 :       same rules as utf8decode(). Note that some UCS values considered
    1183                 :       illegal by RFC 3629, such as 0xffff, are considered legal by this.
    1184                 :     - Returns 1 if there are only single-byte characters (ie no bytes
    1185                 :       have the high bit set). This is legal UTF-8, but also indicates
    1186                 :       plain ASCII. It also returns 1 if \a srclen is zero.
    1187                 :     - Returns 2 if there are only characters less than 0x800.
    1188                 :     - Returns 3 if there are only characters less than 0x10000.
    1189                 :     - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
    1190                 : 
    1191                 :     Because there are many illegal sequences in UTF-8, it is almost
    1192                 :     impossible for a string in another encoding to be confused with
    1193                 :     UTF-8. This is very useful for transitioning Unix to UTF-8
    1194                 :     filenames, you can simply test each filename with this to decide
    1195                 :     if it is UTF-8 or in the locale encoding. My hope is that if
    1196                 :     this is done we will be able to cleanly transition to a locale-less
    1197                 :     encoding.
    1198                 : */
    1199                 : 
    1200          951570 : static int utf8test(const char* src, unsigned srclen) {
    1201          951570 :   int ret = 1;
    1202          951570 :   const char* p = src;
    1203          951570 :   const char* e = src+srclen;
    1204         7443736 :   while (p < e) {
    1205         5540598 :     if (*p & 0x80) {
    1206             144 :       int len; utf8decode(p,e,&len);
    1207             144 :       if (len < 2) return 0;
    1208             142 :       if (len > ret) ret = len;
    1209             142 :       p += len;
    1210                 :     } else {
    1211         5540454 :       p++;
    1212                 :     }
    1213                 :   }
    1214          951568 :   return ret;
    1215                 : }
    1216                 : 
    1217                 : #endif /* defined(CPL_RECODE_STUB) */

Generated by: LCOV version 1.7