LCOV - code coverage report
Current view: directory - port - cpl_recode_stub.cpp (source / functions) Found Hit Coverage
Test: gdal_filtered.info Lines: 230 30 13.0 %
Date: 2010-01-09 Functions: 11 4 36.4 %

       1                 : /**********************************************************************
       2                 :  * $Id: cpl_recode_stub.cpp 17405 2009-07-17 06:13:24Z chaitanya $
       3                 :  *
       4                 :  * Name:     cpl_recode.cpp
       5                 :  * Project:  CPL - Common Portability Library
       6                 :  * Purpose:  Character set recoding and char/wchar_t conversions.
       7                 :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       8                 :  *
       9                 :  * The bulk of this code is derived from the utf.c module from FLTK. It
      10                 :  * was originally downloaded from:
      11                 :  *    http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
      12                 :  * 
      13                 :  **********************************************************************
      14                 :  * Copyright (c) 2008, Frank Warmerdam
      15                 :  * Copyright 2006 by Bill Spitzak and others.
      16                 :  *
      17                 :  * Permission to use, copy, modify, and distribute this software for any
      18                 :  * purpose with or without fee is hereby granted, provided that the above
      19                 :  * copyright notice and this permission notice appear in all copies.
      20                 :  *
      21                 :  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
      22                 :  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
      23                 :  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
      24                 :  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      25                 :  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
      26                 :  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
      27                 :  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
      28                 :  **********************************************************************/
      29                 : 
      30                 : #include "cpl_string.h"
      31                 : 
      32                 : CPL_CVSID("$Id: cpl_recode_stub.cpp 17405 2009-07-17 06:13:24Z chaitanya $");
      33                 : 
      34                 : #define CPL_RECODE_STUB
      35                 : 
      36                 : #ifdef CPL_RECODE_STUB 
      37                 : 
      38                 : static unsigned utf8decode(const char* p, const char* end, int* len);
      39                 : static unsigned utf8towc(const char* src, unsigned srclen,
      40                 :                          wchar_t* dst, unsigned dstlen);
      41                 : static unsigned utf8toa(const char* src, unsigned srclen,
      42                 :                         char* dst, unsigned dstlen);
      43                 : static unsigned utf8fromwc(char* dst, unsigned dstlen,
      44                 :                            const wchar_t* src, unsigned srclen);
      45                 : static unsigned utf8froma(char* dst, unsigned dstlen,
      46                 :                           const char* src, unsigned srclen);
      47                 : static int utf8test(const char* src, unsigned srclen);
      48                 : 
      49                 : #ifdef FUTURE_NEEDS
      50                 : static const char* utf8fwd(const char* p, const char* start, const char* end);
      51                 : static const char* utf8back(const char* p, const char* start, const char*end);
      52                 : static int utf8encode(unsigned ucs, char* buf);
      53                 : static int utf8bytes(unsigned ucs);
      54                 : #endif /* def FUTURE_NEEDS */
      55                 : 
      56                 : /************************************************************************/
      57                 : /* ==================================================================== */
      58                 : /*  Stub Implementation not depending on iconv() or WIN32 API.  */
      59                 : /* ==================================================================== */
      60                 : /************************************************************************/
      61                 : 
      62                 : /************************************************************************/
      63                 : /*                             CPLRecode()                              */
      64                 : /************************************************************************/
      65                 : 
      66                 : /**
      67                 :  * Convert a string from a source encoding to a destination encoding.
      68                 :  *
      69                 :  * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
      70                 :  * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
      71                 :  * <ul>
      72                 :  *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in fact)</li>
      73                 :  *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
      74                 :  *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
      75                 :  * </ul>
      76                 :  *
      77                 :  * If an error occurs an error may, or may not be posted with CPLError(). 
      78                 :  *
      79                 :  * @param pszSource a NUL terminated string.
      80                 :  * @param pszSrcEncoding the source encoding.
      81                 :  * @param pszDstEncoding the destination encoding.
      82                 :  *
      83                 :  * @return a NUL terminated string which should be freed with CPLFree().
      84                 :  *
      85                 :  * @since GDAL 1.6.0
      86                 :  */
      87                 : 
      88               0 : char CPL_DLL *CPLRecode( const char *pszSource, 
      89                 :                          const char *pszSrcEncoding, 
      90                 :                          const char *pszDstEncoding )
      91                 : 
      92                 : {
      93                 : /* -------------------------------------------------------------------- */
      94                 : /*      Handle a few common short cuts.                                 */
      95                 : /* -------------------------------------------------------------------- */
      96               0 :     if( strcmp(pszSrcEncoding,pszDstEncoding) == 0 )
      97               0 :         return CPLStrdup(pszSource);
      98                 : 
      99               0 :     if( strcmp(pszSrcEncoding,CPL_ENC_ASCII) == 0 
     100                 :         && (strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 
     101                 :             || strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0) )
     102               0 :         return CPLStrdup(pszSource);
     103                 : 
     104                 : /* -------------------------------------------------------------------- */
     105                 : /*      If the source or destination is current locale(), we change     */
     106                 : /*      it to ISO8859-1 since our stub implementation does not          */
     107                 : /*      attempt to address locales properly.                            */
     108                 : /* -------------------------------------------------------------------- */
     109                 : 
     110               0 :     if( pszSrcEncoding[0] == '\0' )
     111               0 :         pszSrcEncoding = CPL_ENC_ISO8859_1;
     112                 : 
     113               0 :     if( pszDstEncoding[0] == '\0' )
     114               0 :         pszDstEncoding = CPL_ENC_ISO8859_1;
     115                 : 
     116                 : /* -------------------------------------------------------------------- */
     117                 : /*      ISO8859 to UTF8                                                 */
     118                 : /* -------------------------------------------------------------------- */
     119               0 :     if( strcmp(pszSrcEncoding,CPL_ENC_ISO8859_1) == 0 
     120                 :         && strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
     121                 :     {
     122               0 :         int nCharCount = strlen(pszSource);
     123               0 :         char *pszResult = (char *) CPLCalloc(1,nCharCount*2+1);
     124                 :         
     125               0 :         utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
     126                 :         
     127               0 :         return pszResult;
     128                 :     }
     129                 : 
     130                 : /* -------------------------------------------------------------------- */
     131                 : /*      UTF8 to ISO8859                                                 */
     132                 : /* -------------------------------------------------------------------- */
     133               0 :     if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0 
     134                 :         && strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0 )
     135                 :     {
     136               0 :         int nCharCount = strlen(pszSource);
     137               0 :         char *pszResult = (char *) CPLCalloc(1,nCharCount+1);
     138                 :         
     139               0 :         utf8toa( pszSource, nCharCount, pszResult, nCharCount+1 );
     140                 :         
     141               0 :         return pszResult;
     142                 :     }
     143                 : 
     144                 : /* -------------------------------------------------------------------- */
     145                 : /*      Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with    */
     146                 : /*      a one-time warning.                                             */
     147                 : /* -------------------------------------------------------------------- */
     148               0 :     if( strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
     149                 :     {
     150               0 :         int nCharCount = strlen(pszSource);
     151               0 :         char *pszResult = (char *) CPLCalloc(1,nCharCount*2+1);
     152                 :         static int bHaveWarned = FALSE;
     153                 : 
     154               0 :         if( !bHaveWarned )
     155                 :         {
     156               0 :             bHaveWarned = 1;
     157                 :             CPLError( CE_Warning, CPLE_AppDefined, 
     158                 :                       "Recode from %s to UTF-8 not supported, treated as ISO8859-1 to UTF-8.", 
     159               0 :                       pszSrcEncoding );
     160                 :         }
     161                 : 
     162               0 :         utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
     163                 :         
     164               0 :         return pszResult;
     165                 :     }
     166                 : 
     167                 : /* -------------------------------------------------------------------- */
     168                 : /*      UTF-8 to anything else is treated as UTF-8 to ISO-8859-1        */
     169                 : /*      with a warning.                                                 */
     170                 : /* -------------------------------------------------------------------- */
     171               0 :     if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0 
     172                 :         && strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0 )
     173                 :     {
     174               0 :         int nCharCount = strlen(pszSource);
     175               0 :         char *pszResult = (char *) CPLCalloc(1,nCharCount+1);
     176                 :         static int bHaveWarned = FALSE;
     177                 : 
     178               0 :         if( !bHaveWarned )
     179                 :         {
     180               0 :             bHaveWarned = 1;
     181                 :             CPLError( CE_Warning, CPLE_AppDefined, 
     182                 :                       "Recode from UTF-8 to %s not supported, treated as UTF-8 to ISO8859-1.", 
     183               0 :                       pszDstEncoding );
     184                 :         }
     185                 :         
     186               0 :         utf8toa( pszSource, nCharCount, pszResult, nCharCount+1 );
     187                 :         
     188               0 :         return pszResult;
     189                 :     }
     190                 : 
     191                 : /* -------------------------------------------------------------------- */
     192                 : /*      Everything else is treated as a no-op with a warning.           */
     193                 : /* -------------------------------------------------------------------- */
     194                 :     {
     195                 :         static int bHaveWarned = FALSE;
     196                 : 
     197               0 :         if( !bHaveWarned )
     198                 :         {
     199               0 :             bHaveWarned = 1;
     200                 :             CPLError( CE_Warning, CPLE_AppDefined, 
     201                 :                       "Recode from %s to %s not supported, no change applied.", 
     202               0 :                       pszSrcEncoding, pszDstEncoding );
     203                 :         }
     204                 :         
     205               0 :         return CPLStrdup(pszSource);
     206                 :     }
     207                 : }
     208                 : 
     209                 : /************************************************************************/
     210                 : /*                         CPLRecodeFromWChar()                         */
     211                 : /************************************************************************/
     212                 : 
     213                 : /**
     214                 :  * Convert wchar_t string to UTF-8. 
     215                 :  *
     216                 :  * Convert a wchar_t string into a multibyte utf-8 string.  The only
     217                 :  * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
     218                 :  * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
     219                 :  * and CPL_ENC_ISO8859_1.  In some cases (ie. using iconv()) other encodings 
     220                 :  * may also be supported.
     221                 :  *
     222                 :  * Note that the wchar_t type varies in size on different systems. On
     223                 :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     224                 :  *
     225                 :  * If an error occurs an error may, or may not be posted with CPLError(). 
     226                 :  *
     227                 :  * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
     228                 :  * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
     229                 :  * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
     230                 :  *
     231                 :  * @return a zero terminated multi-byte string which should be freed with 
     232                 :  * CPLFree(), or NULL if an error occurs. 
     233                 :  *
     234                 :  * @since GDAL 1.6.0
     235                 :  */
     236                 : 
     237               0 : char CPL_DLL *CPLRecodeFromWChar( const wchar_t *pwszSource, 
     238                 :                                   const char *pszSrcEncoding, 
     239                 :                                   const char *pszDstEncoding )
     240                 : 
     241                 : {
     242                 : /* -------------------------------------------------------------------- */
     243                 : /*      We try to avoid changes of character set.  We are just          */
     244                 : /*      providing for unicode to unicode.                               */
     245                 : /* -------------------------------------------------------------------- */
     246               0 :     if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0
     247                 :         && strcmp(pszSrcEncoding,CPL_ENC_UTF16) != 0
     248                 :         && strcmp(pszSrcEncoding,CPL_ENC_UCS2) != 0
     249                 :         && strcmp(pszSrcEncoding,CPL_ENC_UCS4) != 0 )
     250                 :     {
     251                 :         CPLError( CE_Failure, CPLE_AppDefined,
     252                 :                   "Stub recoding implementation does not support\n"
     253                 :                   "CPLRecodeFromWChar(...,%s,%s)", 
     254               0 :                   pszSrcEncoding, pszDstEncoding );
     255               0 :         return NULL;
     256                 :     }
     257                 : 
     258                 : /* -------------------------------------------------------------------- */
     259                 : /*      What is the source length.                                      */
     260                 : /* -------------------------------------------------------------------- */
     261               0 :     int nSrcLen = 0;
     262                 : 
     263               0 :     while( pwszSource[nSrcLen] != 0 )
     264               0 :         nSrcLen++;
     265                 : 
     266                 : /* -------------------------------------------------------------------- */
     267                 : /*      Allocate destination buffer plenty big.                         */
     268                 : /* -------------------------------------------------------------------- */
     269                 :     char *pszResult;
     270                 :     int nDstBufSize, nDstLen;
     271                 : 
     272               0 :     nDstBufSize = nSrcLen * 4 + 1;
     273               0 :     pszResult = (char *) CPLMalloc(nDstBufSize); // nearly worst case.
     274                 : 
     275                 : /* -------------------------------------------------------------------- */
     276                 : /*      Convert, and confirm we had enough space.                       */
     277                 : /* -------------------------------------------------------------------- */
     278               0 :     nDstLen = utf8fromwc( pszResult, nDstBufSize, pwszSource, nSrcLen );
     279               0 :     if( nDstLen >= nDstBufSize - 1 )
     280                 :     {
     281                 :         CPLAssert( FALSE ); // too small!
     282               0 :         return NULL;
     283                 :     }
     284                 : 
     285                 : /* -------------------------------------------------------------------- */
     286                 : /*      If something other than UTF-8 was requested, recode now.        */
     287                 : /* -------------------------------------------------------------------- */
     288               0 :     if( strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
     289               0 :         return pszResult;
     290                 : 
     291                 :     char *pszFinalResult = 
     292               0 :         CPLRecode( pszResult, CPL_ENC_UTF8, pszDstEncoding );
     293                 : 
     294               0 :     CPLFree( pszResult );
     295                 :     
     296               0 :     return pszFinalResult;
     297                 : }
     298                 : 
     299                 : /************************************************************************/
     300                 : /*                          CPLRecodeToWChar()                          */
     301                 : /************************************************************************/
     302                 : 
     303                 : /**
     304                 :  * Convert UTF-8 string to a wchar_t string.
     305                 :  *
     306                 :  * Convert a 8bit, multi-byte per character input string into a wide
     307                 :  * character (wchar_t) string.  The only guaranteed supported source encodings
     308                 :  * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1).  The only
     309                 :  * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
     310                 :  * and destination encodings may be supported depending on the underlying
     311                 :  * implementation. 
     312                 :  *
     313                 :  * Note that the wchar_t type varies in size on different systems. On
     314                 :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     315                 :  *
     316                 :  * If an error occurs an error may, or may not be posted with CPLError(). 
     317                 :  *
     318                 :  * @param pszSource input multi-byte character string.
     319                 :  * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
     320                 :  * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2. 
     321                 :  *
     322                 :  * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
     323                 :  * NULL on error.
     324                 :  *
     325                 :  * @since GDAL 1.6.0
     326                 :  */
     327                 : 
     328               0 : wchar_t CPL_DLL *CPLRecodeToWChar( const char *pszSource,
     329                 :                                    const char *pszSrcEncoding, 
     330                 :                                    const char *pszDstEncoding )
     331                 : 
     332                 : {
     333               0 :     char *pszUTF8Source = (char *) pszSource;
     334                 : 
     335               0 :     if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0 
     336                 :         && strcmp(pszSrcEncoding,CPL_ENC_ASCII) != 0 )
     337                 :     {
     338               0 :         pszUTF8Source = CPLRecode( pszSource, pszSrcEncoding, CPL_ENC_UTF8 );
     339               0 :         if( pszUTF8Source == NULL )
     340               0 :             return NULL;
     341                 :     }
     342                 : 
     343                 : /* -------------------------------------------------------------------- */
     344                 : /*      We try to avoid changes of character set.  We are just          */
     345                 : /*      providing for unicode to unicode.                               */
     346                 : /* -------------------------------------------------------------------- */
     347               0 :     if( strcmp(pszDstEncoding,CPL_ENC_UCS2) != 0
     348                 :         && strcmp(pszDstEncoding,CPL_ENC_UCS4) != 0 
     349                 :         && strcmp(pszDstEncoding,CPL_ENC_UTF16) != 0 )
     350                 :     {
     351                 :         CPLError( CE_Failure, CPLE_AppDefined,
     352                 :                   "Stub recoding implementation does not support\n"
     353                 :                   "CPLRecodeToWChar(...,%s,%s)", 
     354               0 :                   pszSrcEncoding, pszDstEncoding );
     355               0 :         return NULL;
     356                 :     }
     357                 : 
     358                 : /* -------------------------------------------------------------------- */
     359                 : /*      Do the UTF-8 to UCS-2 recoding.                                 */
     360                 : /* -------------------------------------------------------------------- */
     361               0 :     int nSrcLen = strlen(pszUTF8Source);
     362               0 :     wchar_t *pwszResult = (wchar_t *) CPLCalloc(sizeof(wchar_t),nSrcLen+1);
     363                 : 
     364               0 :     utf8towc( pszUTF8Source, nSrcLen, pwszResult, nSrcLen+1 );
     365                 : 
     366               0 :     if( pszUTF8Source != pszSource )
     367               0 :         CPLFree( pszUTF8Source );
     368                 : 
     369               0 :     return pwszResult;
     370                 : }
     371                 : 
     372                 : 
     373                 : /************************************************************************/
     374                 : /*                                 CPLIsUTF8()                          */
     375                 : /************************************************************************/
     376                 : 
     377                 : /**
     378                 :  * Test if a string is encoded as UTF-8.
     379                 :  *
     380                 :  * @param pabyData input string to test
     381                 :  * @param nLen length of the input string, or -1 if the function must compute
     382                 :  *             the string length. In which case it must be null terminated.
     383                 :  * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
     384                 :  *
     385                 :  * @since GDAL 1.7.0
     386                 :  */
     387              94 : int CPLIsUTF8(const char* pabyData, int nLen)
     388                 : {
     389              94 :     if (nLen < 0)
     390              94 :         nLen = strlen(pabyData);
     391              94 :     return utf8test(pabyData, (unsigned)nLen) != 0;
     392                 : }
     393                 : 
     394                 : /************************************************************************/
     395                 : /*                          CPLForceToASCII()                           */
     396                 : /************************************************************************/
     397                 : 
     398                 : /**
     399                 :  * Return a new string that is made only of ASCII characters. If non-ASCII
     400                 :  * characters are found in the input string, they will be replaced by the
     401                 :  * provided replacement character.
     402                 :  *
     403                 :  * @param pabyData input string to test
     404                 :  * @param nLen length of the input string, or -1 if the function must compute
     405                 :  *             the string length. In which case it must be null terminated.
     406                 :  * @param chReplacementChar character which will be used when the input stream
     407                 :  *                          contains a non ASCII character. Must be valid ASCII !
     408                 :  *
     409                 :  * @return a new string that must be freed with CPLFree().
     410                 :  *
     411                 :  * @since GDAL 1.7.0
     412                 :  */
     413               1 : char CPL_DLL *CPLForceToASCII(const char* pabyData, int nLen, char chReplacementChar)
     414                 : {
     415               1 :     if (nLen < 0)
     416               1 :         nLen = strlen(pabyData);
     417               1 :     char* pszOutputString = (char*)CPLMalloc(nLen + 1);
     418                 :     int i;
     419               5 :     for(i=0;i<nLen;i++)
     420                 :     {
     421               4 :         if (((unsigned char*)pabyData)[i] > 127)
     422               1 :             pszOutputString[i] = chReplacementChar;
     423                 :         else
     424               3 :             pszOutputString[i] = pabyData[i];
     425                 :     }
     426               1 :     pszOutputString[i] = '\0';
     427               1 :     return pszOutputString;
     428                 : }
     429                 : 
     430                 : 
     431                 : /************************************************************************/
     432                 : /* ==================================================================== */
     433                 : /*  UTF.C code from FLTK with some modifications.                   */
     434                 : /* ==================================================================== */
     435                 : /************************************************************************/
     436                 : 
     437                 : /* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
     438                 :    they are instead turned into the Unicode REPLACEMENT CHARACTER, of
     439                 :    value 0xfffd.
     440                 :    If this is on utf8decode will correctly map most (perhaps all)
     441                 :    human-readable text that is in ISO-8859-1. This may allow you
     442                 :    to completely ignore character sets in your code because virtually
     443                 :    everything is either ISO-8859-1 or UTF-8.
     444                 : */
     445                 : #define ERRORS_TO_ISO8859_1 1
     446                 : 
     447                 : /* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
     448                 :    Unicode index for Microsoft's CP1252 character set. You should
     449                 :    also set ERRORS_TO_ISO8859_1. With this a huge amount of more
     450                 :    available text (such as all web pages) are correctly converted
     451                 :    to Unicode.
     452                 : */
     453                 : #define ERRORS_TO_CP1252 1
     454                 : 
     455                 : /* A number of Unicode code points are in fact illegal and should not
     456                 :    be produced by a UTF-8 converter. Turning this on will replace the
     457                 :    bytes in those encodings with errors. If you do this then converting
     458                 :    arbitrary 16-bit data to UTF-8 and then back is not an identity,
     459                 :    which will probably break a lot of software.
     460                 : */
     461                 : #define STRICT_RFC3629 0
     462                 : 
     463                 : #if ERRORS_TO_CP1252
     464                 : // Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
     465                 : // to Unicode:
     466                 : static unsigned short cp1252[32] = {
     467                 :   0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
     468                 :   0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
     469                 :   0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
     470                 :   0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
     471                 : };
     472                 : #endif
     473                 : 
     474                 : /************************************************************************/
     475                 : /*                             utf8decode()                             */
     476                 : /************************************************************************/
     477                 : 
     478                 : /*
     479                 :     Decode a single UTF-8 encoded character starting at \e p. The
     480                 :     resulting Unicode value (in the range 0-0x10ffff) is returned,
     481                 :     and \e len is set to the number of bytes in the UTF-8 encoding
     482                 :     (adding \e len to \e p will point at the next character).
     483                 : 
     484                 :     If \a p points at an illegal UTF-8 encoding, including one that
     485                 :     would go past \e end, or where a code uses more bytes than
     486                 :     necessary, then *(unsigned char*)p is translated as though it is
     487                 :     in the Microsoft CP1252 character set and \e len is set to 1.
     488                 :     Treating errors this way allows this to decode almost any
     489                 :     ISO-8859-1 or CP1252 text that has been mistakenly placed where
     490                 :     UTF-8 is expected, and has proven very useful.
     491                 : 
     492                 :     If you want errors to be converted to error characters (as the
     493                 :     standards recommend), adding a test to see if the length is
     494                 :     unexpectedly 1 will work:
     495                 : 
     496                 : \code
     497                 :     if (*p & 0x80) { // what should be a multibyte encoding
     498                 :       code = utf8decode(p,end,&len);
     499                 :       if (len<2) code = 0xFFFD; // Turn errors into REPLACEMENT CHARACTER
     500                 :     } else { // handle the 1-byte utf8 encoding:
     501                 :       code = *p;
     502                 :       len = 1;
     503                 :     }
     504                 : \endcode
     505                 : 
     506                 :     Direct testing for the 1-byte case (as shown above) will also
     507                 :     speed up the scanning of strings where the majority of characters
     508                 :     are ASCII.
     509                 : */
     510               1 : static unsigned utf8decode(const char* p, const char* end, int* len) // returns the code point; *len receives the number of bytes consumed (1 on every error path)
     511                 : {
     512               1 :   unsigned char c = *(unsigned char*)p; // lead byte as unsigned. NOTE(review): read without any p < end check in this function
     513               1 :   if (c < 0x80) { // 0x00-0x7f: plain ASCII
     514               0 :     *len = 1;
     515               0 :     return c;
     516                 : #if ERRORS_TO_CP1252
     517               1 :   } else if (c < 0xa0) { // 0x80-0x9f: translated through the cp1252 table
     518               1 :     *len = 1;
     519               1 :     return cp1252[c-0x80];
     520                 : #endif
     521               0 :   } else if (c < 0xc2) { // remaining bytes below 0xc2 are rejected as lead bytes
     522               0 :     goto FAIL;
     523                 :   }
     524               0 :   if (p+1 >= end || (p[1]&0xc0) != 0x80) goto FAIL; // every multi-byte form needs a 10xxxxxx byte at p[1], inside the buffer
     525               0 :   if (c < 0xe0) { // 0xc2-0xdf: 2-byte sequence
     526               0 :     *len = 2;
     527                 :     return
     528               0 :       ((p[0] & 0x1f) << 6) +
     529               0 :       ((p[1] & 0x3f));
     530               0 :   } else if (c == 0xe0) { // 0xe0: a second byte below 0xa0 would encode a value below 0x800 (overlong)
     531               0 :     if (((unsigned char*)p)[1] < 0xa0) goto FAIL;
     532               0 :     goto UTF8_3;
     533                 : #if STRICT_RFC3629
     534                 :   } else if (c == 0xed) {
     535                 :     // RFC 3629 says surrogate chars are illegal.
     536                 :     if (((unsigned char*)p)[1] >= 0xa0) goto FAIL;
     537                 :     goto UTF8_3;
     538                 :   } else if (c == 0xef) {
     539                 :     // 0xfffe and 0xffff are also illegal characters
     540                 :     if (((unsigned char*)p)[1]==0xbf && // NOTE(review): p[2] is read on the next line before the p+2 < end check at UTF8_3
     541                 :   ((unsigned char*)p)[2]>=0xbe) goto FAIL;
     542                 :     goto UTF8_3;
     543                 : #endif
     544               0 :   } else if (c < 0xf0) { // 0xe1-0xef: 3-byte sequence
     545                 :   UTF8_3:
     546               0 :     if (p+2 >= end || (p[2]&0xc0) != 0x80) goto FAIL; // third byte must exist and be a continuation byte
     547               0 :     *len = 3;
     548                 :     return
     549               0 :       ((p[0] & 0x0f) << 12) +
     550               0 :       ((p[1] & 0x3f) << 6) +
     551               0 :       ((p[2] & 0x3f));
     552               0 :   } else if (c == 0xf0) { // 0xf0: a second byte below 0x90 would encode a value below 0x10000 (overlong)
     553               0 :     if (((unsigned char*)p)[1] < 0x90) goto FAIL;
     554               0 :     goto UTF8_4;
     555               0 :   } else if (c < 0xf4) { // 0xf1-0xf3: 4-byte sequence
     556                 :   UTF8_4:
     557               0 :     if (p+3 >= end || (p[2]&0xc0) != 0x80 || (p[3]&0xc0) != 0x80) goto FAIL; // third and fourth bytes must exist and be continuation bytes
     558               0 :     *len = 4;
     559                 : #if STRICT_RFC3629
     560                 :     // RFC 3629 says all codes ending in fffe or ffff are illegal:
     561                 :     if ((p[1]&0xf)==0xf &&
     562                 :   ((unsigned char*)p)[2] == 0xbf &&
     563                 :   ((unsigned char*)p)[3] >= 0xbe) goto FAIL;
     564                 : #endif
     565                 :     return
     566               0 :       ((p[0] & 0x07) << 18) +
     567               0 :       ((p[1] & 0x3f) << 12) +
     568               0 :       ((p[2] & 0x3f) << 6) +
     569               0 :       ((p[3] & 0x3f));
     570               0 :   } else if (c == 0xf4) {
     571               0 :     if (((unsigned char*)p)[1] > 0x8f) goto FAIL; // after 0x10ffff
     572               0 :     goto UTF8_4;
     573                 :   } else { // 0xf5-0xff: no sequence is accepted for these lead bytes
     574                 :   FAIL:
     575               0 :     *len = 1; // consume exactly one byte so the caller can resynchronise on the next one
     576                 : #if ERRORS_TO_ISO8859_1
     577               0 :     return c; // hand the raw byte back unchanged
     578                 : #else
     579                 :     return 0xfffd; // Unicode REPLACEMENT CHARACTER
     580                 : #endif
     581                 :   }
     582                 : }
     583                 : 
     584                 : /************************************************************************/
     585                 : /*                              utf8fwd()                               */
     586                 : /************************************************************************/
     587                 : 
     588                 : /*
     589                 :   Move \a p forward until it points to the start of a UTF-8
     590                 :   character. If it already points at the start of one then it
     591                 :   is returned unchanged. Any UTF-8 errors are treated as though each
     592                 :   byte of the error is an individual character.
     593                 : 
     594                 :   \e start is the start of the string and is used to limit the
     595                 :   backwards search for the start of a utf8 character.
     596                 : 
     597                 :   \e end is the end of the string and is assumed to be a break
     598                 :   between characters. It is assumed to be greater than p.
     599                 : 
     600                 :   This function is for moving a pointer that was jumped to the
     601                 :   middle of a string, such as when doing a binary search for
     602                 :   a position. You should use either this or utf8back() depending
     603                 :   on which direction your algorithm can handle the pointer
     604                 :   moving. Do not use this to scan strings, use utf8decode()
     605                 :   instead.
     606                 : */
     607                 : 
     608                 : #ifdef FUTURE_NEEDS
     609                 : static const char* utf8fwd(const char* p, const char* start, const char* end) // returns p itself or a pointer just past the sequence that contains p
     610                 : {
     611                 :   const char* a;
     612                 :   int len;
     613                 :   // if we are not pointing at a continuation character, we are done:
     614                 :   if ((*p&0xc0) != 0x80) return p;
     615                 :   // search backwards for a lead byte (0xc0 or above) starting the character:
     616                 :   for (a = p-1; ; --a) {
     617                 :     if (a < start) return p; // ran off the front of the string: p stands on its own
     618                 :     if (!(a[0]&0x80)) return p; // reached an ASCII byte first: p stands on its own
     619                 :     if ((a[0]&0x40)) break; // both top bits set (11xxxxxx): candidate lead byte
     620                 :   }
     621                 :   utf8decode(a,end,&len); // only the length is needed; the decoded value is discarded
     622                 :   a += len;
     623                 :   if (a > p) return a; // the sequence starting at the lead byte covers p: step past it
     624                 :   return p; // the sequence ended at or before p, so p is treated as an individual character
     625                 : }
     626                 : #endif /* def FUTURE_NEEDS */
     627                 : 
     628                 : /************************************************************************/
     629                 : /*                              utf8back()                              */
     630                 : /************************************************************************/
     631                 : 
     632                 : /*
     633                 :   Move \a p backward until it points to the start of a UTF-8
     634                 :   character. If it already points at the start of one then it
     635                 :   is returned unchanged. Any UTF-8 errors are treated as though each
     636                 :   byte of the error is an individual character.
     637                 : 
     638                 :   \e start is the start of the string and is used to limit the
     639                 :   backwards search for the start of a UTF-8 character.
     640                 : 
     641                 :   \e end is the end of the string and is assumed to be a break
     642                 :   between characters. It is assumed to be greater than p.
     643                 : 
     644                 :   If you wish to decrement a UTF-8 pointer, pass p-1 to this.
     645                 : */
     646                 : 
     647                 : #ifdef FUTURE_NEEDS
     648                 : static const char* utf8back(const char* p, const char* start, const char* end) // returns p itself or the lead byte of the sequence that contains p
     649                 : {
     650                 :   const char* a;
     651                 :   int len;
     652                 :   // if we are not pointing at a continuation character, we are done:
     653                 :   if ((*p&0xc0) != 0x80) return p;
     654                 :   // search backwards for a lead byte (0xc0 or above) starting the character:
     655                 :   for (a = p-1; ; --a) {
     656                 :     if (a < start) return p; // ran off the front of the string: p stands on its own
     657                 :     if (!(a[0]&0x80)) return p; // reached an ASCII byte first: p stands on its own
     658                 :     if ((a[0]&0x40)) break; // both top bits set (11xxxxxx): candidate lead byte
     659                 :   }
     660                 :   utf8decode(a,end,&len); // only the length is needed; the decoded value is discarded
     661                 :   if (a+len > p) return a; // the sequence starting at the lead byte covers p: back up to its start
     662                 :   return p; // the sequence ended at or before p, so p is treated as an individual character
     663                 : }
     664                 : #endif /* def FUTURE_NEEDS */
     665                 : 
     666                 : /************************************************************************/
     667                 : /*                             utf8bytes()                              */
     668                 : /************************************************************************/
     669                 : 
     670                 : /* Returns number of bytes that utf8encode() will use to encode the
     671                 :   character \a ucs. */
     672                 : #ifdef FUTURE_NEEDS
     673                 : static int utf8bytes(unsigned ucs) { // mirrors the range tests in utf8encode()
     674                 :   if (ucs < 0x000080U) {
     675                 :     return 1;
     676                 :   } else if (ucs < 0x000800U) {
     677                 :     return 2;
     678                 :   } else if (ucs < 0x010000U) {
     679                 :     return 3;
     680                 :   } else if (ucs < 0x10ffffU) { // NOTE(review): '<' excludes 0x10ffff itself, which is then reported as 3 bytes; '<=' looks intended -- confirm
     681                 :     return 4;
     682                 :   } else {
     683                 :     return 3; // length of the illegal character encoding
     684                 :   }
     685                 : }
     686                 : #endif /* def FUTURE_NEEDS */
     687                 : 
     688                 : /************************************************************************/
     689                 : /*                             utf8encode()                             */
     690                 : /************************************************************************/
     691                 : 
     692                 : /* Write the UTF-8 encoding of \e ucs into \e buf and return the
     693                 :     number of bytes written. Up to 4 bytes may be written. If you know
     694                 :     that \a ucs is less than 0x10000 then at most 3 bytes will be written.
     695                 :     If you wish to speed this up, remember that anything less than 0x80
     696                 :     is written as a single byte.
     697                 : 
     698                 :     If ucs is greater than 0x10ffff this is an illegal character
     699                 :     according to RFC 3629. These are converted as though they are
     700                 :     0xFFFD (REPLACEMENT CHARACTER).
     701                 : 
     702                 :     RFC 3629 also says many other values for \a ucs are illegal (in
     703                 :     the range 0xd800 to 0xdfff, or ending with 0xfffe or
     704                 :     0xffff). However I encode these as though they are legal, so that
     705                 :     utf8encode/utf8decode will be the identity for all codes between 0
     706                 :     and 0x10ffff.
     707                 : */
     708                 : #ifdef FUTURE_NEEDS
     709                 : static int utf8encode(unsigned ucs, char* buf) { // writes 1-4 bytes to buf, no terminator; returns the count written
     710                 :   if (ucs < 0x000080U) { // 1 byte: 0xxxxxxx
     711                 :     buf[0] = ucs;
     712                 :     return 1;
     713                 :   } else if (ucs < 0x000800U) { // 2 bytes: 110xxxxx 10xxxxxx
     714                 :     buf[0] = 0xc0 | (ucs >> 6);
     715                 :     buf[1] = 0x80 | (ucs & 0x3F);
     716                 :     return 2;
     717                 :   } else if (ucs < 0x010000U) { // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
     718                 :     buf[0] = 0xe0 | (ucs >> 12);
     719                 :     buf[1] = 0x80 | ((ucs >> 6) & 0x3F);
     720                 :     buf[2] = 0x80 | (ucs & 0x3F);
     721                 :     return 3;
     722                 :   } else if (ucs < 0x0010ffffU) { // 4 bytes. NOTE(review): '<' excludes 0x10ffff itself, which then falls through and is written as 0xfffd; '<=' looks intended -- confirm
     723                 :     buf[0] = 0xf0 | (ucs >> 18);
     724                 :     buf[1] = 0x80 | ((ucs >> 12) & 0x3F);
     725                 :     buf[2] = 0x80 | ((ucs >> 6) & 0x3F);
     726                 :     buf[3] = 0x80 | (ucs & 0x3F);
     727                 :     return 4;
     728                 :   } else {
     729                 :     // encode 0xfffd:
     730                 :     buf[0] = 0xefU;
     731                 :     buf[1] = 0xbfU;
     732                 :     buf[2] = 0xbdU;
     733                 :     return 3;
     734                 :   }
     735                 : }
     736                 : #endif /* def FUTURE_NEEDS */
     737                 : 
     738                 : /************************************************************************/
     739                 : /*                              utf8towc()                              */
     740                 : /************************************************************************/
     741                 : 
     742                 : /*  Convert a UTF-8 sequence into an array of wchar_t. These
     743                 :     are used by some system calls, especially on Windows.
     744                 : 
     745                 :     \a src points at the UTF-8, and \a srclen is the number of bytes to
     746                 :     convert.
     747                 : 
     748                 :     \a dst points at an array to write, and \a dstlen is the number of
     749                 :     locations in this array. At most \a dstlen-1 words will be
     750                 :     written there, plus a 0 terminating word. Thus this function
     751                 :     will never overwrite the buffer and will always return a
     752                 :     zero-terminated string. If \a dstlen is zero then \a dst can be
     753                 :     null and no data is written, but the length is returned.
     754                 : 
     755                 :     The return value is the number of words that \e would be written
     756                 :     to \a dst if it were long enough, not counting the terminating
     757                 :     zero. If the return value is greater or equal to \a dstlen it
     758                 :     indicates truncation, you can then allocate a new array of size
     759                 :     return+1 and call this again.
     760                 : 
     761                 :     Errors in the UTF-8 are converted as though each byte in the
     762                 :     erroneous string is in the Microsoft CP1252 encoding. This allows
     763                 :     ISO-8859-1 text mistakenly identified as UTF-8 to be printed
     764                 :     correctly.
     765                 : 
     766                 :     Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
     767                 :     and most other systems. Where wchar_t is 16 bits, Unicode
     768                 :     characters in the range 0x10000 to 0x10ffff are converted to
     769                 :     "surrogate pairs" which take two words each (this is called UTF-16
     770                 :     encoding). If wchar_t is 32 bits this rather nasty problem is
     771                 :     avoided.
     772                 : */
     773               0 : static unsigned utf8towc(const char* src, unsigned srclen, // returns the number of words the full conversion needs, excluding the terminator
     774                 :                          wchar_t* dst, unsigned dstlen)
     775                 : {
     776               0 :   const char* p = src;
     777               0 :   const char* e = src+srclen;
     778               0 :   unsigned count = 0;
     779               0 :   if (dstlen) for (;;) { // copy phase; skipped entirely when dstlen is 0, so dst is never touched
     780               0 :     if (p >= e) {dst[count] = 0; return count;} // input exhausted: terminate and return the exact length
     781               0 :     if (!(*p & 0x80)) { // ascii
     782               0 :       dst[count] = *p++;
     783                 :     } else {
     784               0 :       int len; unsigned ucs = utf8decode(p,e,&len);
     785               0 :       p += len;
     786                 : #ifdef _WIN32
     787                 :       if (ucs < 0x10000) {
     788                 :           dst[count] = (wchar_t)ucs;
     789                 :       } else {
     790                 :   // make a surrogate pair:
     791                 :   if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;} // no room for both halves plus the terminator: terminate here, still count the pair
     792                 :         dst[count] = (wchar_t)((((ucs-0x10000u)>>10)&0x3ff) | 0xd800); // high half: top 10 bits of (ucs - 0x10000)
     793                 :         dst[++count] = (wchar_t)((ucs&0x3ff) | 0xdc00); // low half: bottom 10 bits
     794                 :       }
     795                 : #else
     796               0 :       dst[count] = (wchar_t)ucs;
     797                 : #endif
     798                 :     }
     799               0 :     if (++count == dstlen) {dst[count-1] = 0; break;} // buffer full: the last word written is overwritten by the terminator
     800                 :   }
     801                 :   // we filled dst, measure the rest:
     802               0 :   while (p < e) {
     803               0 :     if (!(*p & 0x80)) p++;
     804                 :     else {
     805                 : #ifdef _WIN32
     806                 :       int len; unsigned ucs = utf8decode(p,e,&len);
     807                 :       p += len;
     808                 :       if (ucs >= 0x10000) ++count; // would need a surrogate pair: one extra word
     809                 : #else
     810               0 :       int len; utf8decode(p,e,&len); // only the length is needed here
     811               0 :       p += len;
     812                 : #endif
     813                 :     }
     814               0 :     ++count;
     815                 :   }
     816               0 :   return count;
     817                 : }
     818                 : 
     819                 : /************************************************************************/
     820                 : /*                              utf8toa()                               */
     821                 : /************************************************************************/
     822                 : /* Convert a UTF-8 sequence into an array of 1-byte characters.
     823                 : 
     824                 :     If the UTF-8 decodes to a character greater than 0xff then it is
     825                 :     replaced with '?'.
     826                 : 
     827                 :     Errors in the UTF-8 are converted as individual bytes, same as
     828                 :     utf8decode() does. This allows ISO-8859-1 text mistakenly identified
     829                 :     as UTF-8 to be printed correctly (and possibly CP1252 on Windows).
     830                 : 
     831                 :     \a src points at the UTF-8, and \a srclen is the number of bytes to
     832                 :     convert.
     833                 : 
     834                 :     Up to \a dstlen bytes are written to \a dst, including a null
     835                 :     terminator. The return value is the number of bytes that would be
     836                 :     written, not counting the null terminator. If greater or equal to
     837                 :     \a dstlen then if you malloc a new array of size n+1 you will have
     838                 :     the space needed for the entire string. If \a dstlen is zero then
     839                 :     nothing is written and this call just measures the storage space
     840                 :     needed.
     841                 : */
     842               0 : static unsigned utf8toa(const char* src, unsigned srclen, // returns the number of bytes the full conversion needs, excluding the terminator
     843                 :                         char* dst, unsigned dstlen)
     844                 : {
     845               0 :   const char* p = src;
     846               0 :   const char* e = src+srclen;
     847               0 :   unsigned count = 0;
     848               0 :   if (dstlen) for (;;) { // copy phase; skipped entirely when dstlen is 0
     849                 :     unsigned char c;
     850               0 :     if (p >= e) {dst[count] = 0; return count;} // input exhausted: terminate and return the exact length
     851               0 :     c = *(unsigned char*)p;
     852               0 :     if (c < 0xC2) { // ascii or bad code: copied as-is, so bytes 0x80-0xc1 never reach utf8decode() or its CP1252 table
     853               0 :       dst[count] = c;
     854               0 :       p++;
     855                 :     } else {
     856               0 :       int len; unsigned ucs = utf8decode(p,e,&len);
     857               0 :       p += len;
     858               0 :       if (ucs < 0x100) dst[count] = (char)ucs; // fits in one byte
     859               0 :       else dst[count] = '?'; // anything above 0xff is replaced
     860                 :     }
     861               0 :     if (++count >= dstlen) {dst[count-1] = 0; break;} // buffer full: the last byte written is overwritten by the terminator
     862                 :   }
     863                 :   // we filled dst, measure the rest:
     864               0 :   while (p < e) {
     865               0 :     if (!(*p & 0x80)) p++;
     866                 :     else {
     867                 :       int len;
     868               0 :       utf8decode(p,e,&len); // only the length is needed here
     869               0 :       p += len;
     870                 :     }
     871               0 :     ++count; // every decoded character yields exactly one output byte
     872                 :   }
     873               0 :   return count;
     874                 : }
     875                 : 
     876                 : /************************************************************************/
     877                 : /*                             utf8fromwc()                             */
     878                 : /************************************************************************/
     879                 : /* Turn "wide characters" as returned by some system calls
     880                 :     (especially on Windows) into UTF-8.
     881                 : 
     882                 :     Up to \a dstlen bytes are written to \a dst, including a null
     883                 :     terminator. The return value is the number of bytes that would be
     884                 :     written, not counting the null terminator. If greater or equal to
     885                 :     \a dstlen then if you malloc a new array of size n+1 you will have
     886                 :     the space needed for the entire string. If \a dstlen is zero then
     887                 :     nothing is written and this call just measures the storage space
     888                 :     needed.
     889                 : 
     890                 :     \a srclen is the number of words in \a src to convert. On Windows
     891                 :     this is not necessarily the number of characters, due to there
     892                 :     possibly being "surrogate pairs" in the UTF-16 encoding used.
     893                 :     On Unix wchar_t is 32 bits and each location is a character.
     894                 : 
     895                 :     On Unix if a src word is greater than 0x10ffff then this is an
     896                 :     illegal character according to RFC 3629. These are converted as
     897                 :     though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
     898                 :     range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
     899                 :     illegal according to RFC 3629. However I encode these as though
     900                 :     they are legal, so that utf8towc will return the original data.
     901                 : 
     902                 :     On Windows "surrogate pairs" are converted to a single character
     903                 :     and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
     904                 :     pairs are converted as though they are individual characters.
     905                 : */
     906               0 : static unsigned utf8fromwc(char* dst, unsigned dstlen, // returns the number of bytes the full conversion needs, excluding the terminator
     907                 :                            const wchar_t* src, unsigned srclen) {
     908               0 :   unsigned i = 0;
     909               0 :   unsigned count = 0;
     910               0 :   if (dstlen) for (;;) { // copy phase; skipped entirely when dstlen is 0
     911                 :     unsigned ucs;
     912               0 :     if (i >= srclen) {dst[count] = 0; return count;} // input exhausted: terminate and return the exact length
     913               0 :     ucs = src[i++];
     914               0 :     if (ucs < 0x80U) { // 1 byte
     915               0 :       dst[count++] = (char)ucs;
     916               0 :       if (count >= dstlen) {dst[count-1] = 0; break;} // buffer full: the byte just written is overwritten by the terminator
     917               0 :     } else if (ucs < 0x800U) { // 2 bytes
     918               0 :       if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;} // no room for 2 bytes plus the terminator: terminate here, still count them
     919               0 :       dst[count++] = 0xc0 | (char)(ucs >> 6);
     920               0 :       dst[count++] = 0x80 | (char)(ucs & 0x3F);
     921                 : #ifdef _WIN32
     922                 :     } else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen && // i already indexes the word after ucs
     923                 :          src[i] >= 0xdc00 && src[i] <= 0xdfff) {
     924                 :       // surrogate pair
     925                 :       unsigned ucs2 = src[i++];
     926                 :       ucs = 0x10000U + ((ucs&0x3ff)<<10) + (ucs2&0x3ff);
     927                 :       // all surrogate pairs turn into 4-byte utf8
     928                 : #else
     929               0 :     } else if (ucs >= 0x10000) {
     930               0 :       if (ucs > 0x10ffff) { // out of range: substitute 0xfffd and emit it through the 3-byte branch
     931               0 :   ucs = 0xfffd;
     932               0 :   goto J1;
     933                 :       }
     934                 : #endif
     935               0 :       if (count+4 >= dstlen) {dst[count] = 0; count += 4; break;} // no room for 4 bytes plus the terminator
     936               0 :       dst[count++] = 0xf0 | (char)(ucs >> 18);
     937               0 :       dst[count++] = 0x80 | (char)((ucs >> 12) & 0x3F);
     938               0 :       dst[count++] = 0x80 | (char)((ucs >> 6) & 0x3F);
     939               0 :       dst[count++] = 0x80 | (char)(ucs & 0x3F);
     940                 :     } else {
     941                 : #ifndef _WIN32
     942                 :     J1:
     943                 : #endif
     944                 :       // all others are 3 bytes:
     945               0 :       if (count+3 >= dstlen) {dst[count] = 0; count += 3; break;} // no room for 3 bytes plus the terminator
     946               0 :       dst[count++] = 0xe0 | (char)(ucs >> 12);
     947               0 :       dst[count++] = 0x80 | (char)((ucs >> 6) & 0x3F);
     948               0 :       dst[count++] = 0x80 | (char)(ucs & 0x3F);
     949                 :     }
     950                 :   }
     951                 :   // we filled dst, measure the rest:
     952               0 :   while (i < srclen) {
     953               0 :     unsigned ucs = src[i++];
     954               0 :     if (ucs < 0x80U) {
     955               0 :       count++;
     956               0 :     } else if (ucs < 0x800U) { // 2 bytes
     957               0 :       count += 2;
     958                 : #ifdef _WIN32
     959                 :     } else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen-1 && // NOTE(review): i was already advanced past ucs, so the low half is src[i]; the copy loop above tests "i < srclen" and src[i] -- this looks one word too far, confirm
     960                 :          src[i+1] >= 0xdc00 && src[i+1] <= 0xdfff) {
     961                 :       // surrogate pair
     962                 :       ++i;
     963                 : #else
     964               0 :     } else if (ucs >= 0x10000 && ucs <= 0x10ffff) {
     965                 : #endif
     966               0 :       count += 4;
     967                 :     } else {
     968               0 :       count += 3; // 0x800-0xffff, plus (non-Windows) values above 0x10ffff, which the copy loop writes as 0xfffd
     969                 :     }
     970                 :   }
     971               0 :   return count;
     972                 : }
     973                 : 
     974                 : 
     975                 : /************************************************************************/
     976                 : /*                             utf8froma()                              */
     977                 : /************************************************************************/
     978                 : 
     979                 : /* Convert an ISO-8859-1 (ie normal c-string) byte stream to UTF-8.
     980                 : 
     981                 :     It is possible this should convert Microsoft's CP1252 to UTF-8
     982                 :     instead. This would translate the codes in the range 0x80-0x9f
     983                 :     to different characters. Currently it does not do this.
     984                 : 
     985                 :     Up to \a dstlen bytes are written to \a dst, including a null
     986                 :     terminator. The return value is the number of bytes that would be
     987                 :     written, not counting the null terminator. If greater or equal to
     988                 :     \a dstlen then if you malloc a new array of size n+1 you will have
     989                 :     the space needed for the entire string. If \a dstlen is zero then
     990                 :     nothing is written and this call just measures the storage space
     991                 :     needed.
     992                 : 
     993                 :     \a srclen is the number of bytes in \a src to convert.
     994                 : 
     995                 :     If the return value equals \a srclen then this indicates that
     996                 :     no conversion is necessary, as only ASCII characters are in the
     997                 :     string.
     998                 : */
     999               0 : static unsigned utf8froma(char* dst, unsigned dstlen,
    1000                 :                           const char* src, unsigned srclen) {
    1001               0 :   const char* p = src;
    1002               0 :   const char* e = src+srclen;
    1003               0 :   unsigned count = 0;
    1004               0 :   if (dstlen) for (;;) {
    1005                 :     unsigned char ucs;
    1006               0 :     if (p >= e) {dst[count] = 0; return count;}
    1007               0 :     ucs = *(unsigned char*)p++;
    1008               0 :     if (ucs < 0x80U) {
    1009               0 :       dst[count++] = ucs;
    1010               0 :       if (count >= dstlen) {dst[count-1] = 0; break;}
    1011                 :     } else { // 2 bytes (note that CP1252 translate could make 3 bytes!)
    1012               0 :       if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
    1013               0 :       dst[count++] = 0xc0 | (ucs >> 6);
    1014               0 :       dst[count++] = 0x80 | (ucs & 0x3F);
    1015                 :     }
    1016                 :   }
    1017                 :   // we filled dst, measure the rest:
    1018               0 :   while (p < e) {
    1019               0 :     unsigned char ucs = *(unsigned char*)p++;
    1020               0 :     if (ucs < 0x80U) {
    1021               0 :       count++;
    1022                 :     } else {
    1023               0 :       count += 2;
    1024                 :     }
    1025                 :   }
    1026               0 :   return count;
    1027                 : }
    1028                 : 
    1029                 : /*
    1030                 : ** For now we disable the rest which is locale() related.  We may need 
    1031                 : ** parts of it later. 
    1032                 : */
    1033                 : 
    1034                 : #ifdef notdef 
    1035                 : 
    1036                 : #ifdef _WIN32
    1037                 : # include <windows.h>
    1038                 : #endif
    1039                 : 
    1040                 : /*! Return true if the "locale" seems to indicate that UTF-8 encoding
    1041                 :     is used. If true the utf8tomb and utf8frommb don't do anything
    1042                 :     useful.
    1043                 : 
    1044                 :     <i>It is highly recommended that you change your system so this
    1045                 :     does return true.</i> On Windows this is done by setting the
    1046                 :     "codepage" to CP_UTF8.  On Unix this is done by setting $LC_CTYPE
    1047                 :     to a string containing the letters "utf" or "UTF" in it, or by
    1048                 :     deleting all $LC* and $LANG environment variables. In the future
    1049                 :     it is likely that all non-Asian Unix systems will return true,
    1050                 :     due to the compatibility of UTF-8 with ISO-8859-1.
    1051                 : */
    1052                 : int utf8locale(void) {
    1053                 :   static int ret = 2;
    1054                 :   if (ret == 2) {
    1055                 : #ifdef _WIN32
    1056                 :     ret = GetACP() == CP_UTF8;
    1057                 : #else
    1058                 :     char* s;
    1059                 :     ret = 1; // assume UTF-8 if no locale
    1060                 :     if (((s = getenv("LC_CTYPE")) && *s) ||
    1061                 :   ((s = getenv("LC_ALL"))   && *s) ||
    1062                 :   ((s = getenv("LANG"))     && *s)) {
    1063                 :       ret = (strstr(s,"utf") || strstr(s,"UTF"));
    1064                 :     }
    1065                 : #endif
    1066                 :   }
    1067                 :   return ret;
    1068                 : }
    1069                 : 
    1070                 : /*! Convert the UTF-8 used by FLTK to the locale-specific encoding
    1071                 :     used for filenames (and sometimes used for data in files).
    1072                 :     Unfortunately due to stupid design you will have to do this as
    1073                 :     needed for filenames. This is a bug on both Unix and Windows.
    1074                 : 
    1075                 :     Up to \a dstlen bytes are written to \a dst, including a null
    1076                 :     terminator. The return value is the number of bytes that would be
    1077                 :     written, not counting the null terminator. If greater or equal to
    1078                 :     \a dstlen then if you malloc a new array of size n+1 you will have
    1079                 :     the space needed for the entire string. If \a dstlen is zero then
    1080                 :     nothing is written and this call just measures the storage space
    1081                 :     needed.
    1082                 : 
    1083                 :     If utf8locale() returns true then this does not change the data.
    1084                 :     It is copied and truncated as necessary to
    1085                 :     the destination buffer and \a srclen is always returned.  */
    1086                 : unsigned utf8tomb(const char* src, unsigned srclen,
    1087                 :       char* dst, unsigned dstlen)
    1088                 : {
    1089                 :   if (!utf8locale()) {
    1090                 : #ifdef _WIN32
    1091                 :     wchar_t lbuf[1024];
    1092                 :     wchar_t* buf = lbuf;
    1093                 :     unsigned length = utf8towc(src, srclen, buf, 1024);
    1094                 :     unsigned ret;
    1095                 :     if (length >= 1024) {
    1096                 :       buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
    1097                 :       utf8towc(src, srclen, buf, length+1);
    1098                 :     }
    1099                 :     if (dstlen) {
    1100                 :       // apparently this does not null-terminate, even though msdn
    1101                 :       // documentation claims it does:
    1102                 :       ret =
    1103                 :         WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0, 0);
    1104                 :       dst[ret] = 0;
    1105                 :     }
    1106                 :     // if it overflows or measuring length, get the actual length:
    1107                 :     if (dstlen==0 || ret >= dstlen-1)
    1108                 :       ret =
    1109                 :   WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
    1110                 :     if (buf != lbuf) free((void*)buf);
    1111                 :     return ret;
    1112                 : #else
    1113                 :     wchar_t lbuf[1024];
    1114                 :     wchar_t* buf = lbuf;
    1115                 :     unsigned length = utf8towc(src, srclen, buf, 1024);
    1116                 :     int ret;
    1117                 :     if (length >= 1024) {
    1118                 :       buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
    1119                 :       utf8towc(src, srclen, buf, length+1);
    1120                 :     }
    1121                 :     if (dstlen) {
    1122                 :       ret = wcstombs(dst, buf, dstlen);
    1123                 :       if (ret >= dstlen-1) ret = wcstombs(0,buf,0);
    1124                 :     } else {
    1125                 :       ret = wcstombs(0,buf,0);
    1126                 :     }
    1127                 :     if (buf != lbuf) free((void*)buf);
    1128                 :     if (ret >= 0) return (unsigned)ret;
    1129                 :     // on any errors we return the UTF-8 as raw text...
    1130                 : #endif
    1131                 :   }
    1132                 :   // identity transform:
    1133                 :   if (srclen < dstlen) {
    1134                 :     memcpy(dst, src, srclen);
    1135                 :     dst[srclen] = 0;
    1136                 :   } else {
    1137                 :     memcpy(dst, src, dstlen-1);
    1138                 :     dst[dstlen-1] = 0;
    1139                 :   }
    1140                 :   return srclen;
    1141                 : }
    1142                 : 
    1143                 : /*! Convert a filename from the locale-specific multibyte encoding
    1144                 :     used by Windows to UTF-8 as used by FLTK.
    1145                 : 
    1146                 :     Up to \a dstlen bytes are written to \a dst, including a null
    1147                 :     terminator. The return value is the number of bytes that would be
    1148                 :     written, not counting the null terminator. If greater or equal to
    1149                 :     \a dstlen then if you malloc a new array of size n+1 you will have
    1150                 :     the space needed for the entire string. If \a dstlen is zero then
    1151                 :     nothing is written and this call just measures the storage space
    1152                 :     needed.
    1153                 : 
    1154                 :     On Unix or on Windows when a UTF-8 locale is in effect, this
    1155                 :     does not change the data. It is copied and truncated as necessary to
    1156                 :     the destination buffer and \a srclen is always returned.
    1157                 :     You may also want to check if utf8test() returns non-zero, so that
    1158                 :     the filesystem can store filenames in UTF-8 encoding regardless of
    1159                 :     the locale.
    1160                 : */
    1161                 : unsigned utf8frommb(char* dst, unsigned dstlen,
    1162                 :         const char* src, unsigned srclen)
    1163                 : {
    1164                 :   if (!utf8locale()) {
    1165                 : #ifdef _WIN32
    1166                 :     wchar_t lbuf[1024];
    1167                 :     wchar_t* buf = lbuf;
    1168                 :     unsigned length;
    1169                 :     unsigned ret;
    1170                 :     length =
    1171                 :       MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
    1172                 :     if (length >= 1024) {
    1173                 :       length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
    1174                 :       buf = (wchar_t*)(malloc(length*sizeof(wchar_t)));
    1175                 :       MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
    1176                 :     }
    1177                 :     ret = utf8fromwc(dst, dstlen, buf, length);
    1178                 :     if (buf != lbuf) free((void*)buf);
    1179                 :     return ret;
    1180                 : #else
    1181                 :     wchar_t lbuf[1024];
    1182                 :     wchar_t* buf = lbuf;
    1183                 :     int length;
    1184                 :     unsigned ret;
    1185                 :     length = mbstowcs(buf, src, 1024);
    1186                 :     if (length >= 1024) {
    1187                 :       length = mbstowcs(0, src, 0)+1;
    1188                 :       buf = (wchar_t*)(malloc(length*sizeof(unsigned short)));
    1189                 :       mbstowcs(buf, src, length);
    1190                 :     }
    1191                 :     if (length >= 0) {
    1192                 :       ret = utf8fromwc(dst, dstlen, buf, length);
    1193                 :       if (buf != lbuf) free((void*)buf);
    1194                 :       return ret;
    1195                 :     }
    1196                 :     // errors in conversion return the UTF-8 unchanged
    1197                 : #endif
    1198                 :   }
    1199                 :   // identity transform:
    1200                 :   if (srclen < dstlen) {
    1201                 :     memcpy(dst, src, srclen);
    1202                 :     dst[srclen] = 0;
    1203                 :   } else {
    1204                 :     memcpy(dst, src, dstlen-1);
    1205                 :     dst[dstlen-1] = 0;
    1206                 :   }
    1207                 :   return srclen;
    1208                 : }
    1209                 : 
    1210                 : #endif /* def notdef - disabled locale specific stuff */
    1211                 : 
    1212                 : /*! Examines the first \a srclen bytes in \a src and returns a verdict
    1213                 :     on whether it is UTF-8 or not.
    1214                 :     - Returns 0 if there are any illegal UTF-8 sequences, using the
    1215                 :       same rules as utf8decode(). Note that some UCS values considered
    1216                 :       illegal by RFC 3629, such as 0xffff, are considered legal by this.
    1217                 :     - Returns 1 if there are only single-byte characters (ie no bytes
    1218                 :       have the high bit set). This is legal UTF-8, but also indicates
    1219                 :       plain ASCII. It also returns 1 if \a srclen is zero.
    1220                 :     - Returns 2 if there are only characters less than 0x800.
    1221                 :     - Returns 3 if there are only characters less than 0x10000.
    1222                 :     - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
    1223                 : 
    1224                 :     Because there are many illegal sequences in UTF-8, it is almost
    1225                 :     impossible for a string in another encoding to be confused with
    1226                 :     UTF-8. This is very useful for transitioning Unix to UTF-8
    1227                 :     filenames: you can simply test each filename with this to decide
    1228                 :     if it is UTF-8 or in the locale encoding. My hope is that if
    1229                 :     this is done we will be able to cleanly transition to a locale-less
    1230                 :     encoding.
    1231                 : */
    1232                 : 
    1233              94 : static int utf8test(const char* src, unsigned srclen) {
    1234              94 :   int ret = 1;
    1235              94 :   const char* p = src;
    1236              94 :   const char* e = src+srclen;
    1237            1195 :   while (p < e) {
    1238            1008 :     if (*p & 0x80) {
    1239               1 :       int len; utf8decode(p,e,&len);
    1240               1 :       if (len < 2) return 0;
    1241               0 :       if (len > ret) ret = len;
    1242               0 :       p += len;
    1243                 :     } else {
    1244            1007 :       p++;
    1245                 :     }
    1246                 :   }
    1247              93 :   return ret;
    1248                 : }
    1249                 : 
    1250                 : #endif /* defined(CPL_RECODE_STUB) */

Generated by: LCOV version 1.7