<?php 
/** 
 * UTF8 - very fast UTF-8 converter implementation for PHP 
 * 
 * It converts between UTF-8 string and codepoints very fast 
 * Suitable for Standard Compression Scheme for Unicode <http://www.unicode.org/reports/tr6/> 
 * Based on PHP source of UTF8 functions by Henri Sivonen <[email protected]> <http://iki.fi/hsivonen/php-utf8/> 
 * 
 * @author     Alexey A.Znaev <[email protected]> <http://xbsoft.org> 
 * @copyright     Copyright (C) 2011-2012 Alexey A.Znaev 
 * @license     http://www.gnu.org/licenses GNU Public License version 3 
 * @link     http://xbsoft.org 
 * @package     UTF8 
 * @version     1.0 
 */ 
 
// ----------------------------------------------------------------------------- 
 
/** 
 * Provides methods for very fast UTF-8 conversion
 * 
 * See UTF8.php File description for full information 
 * 
 * @author     Alexey A.Znaev <[email protected]> <http://xbsoft.org> 
 * @link     http://xbsoft.org 
 * @package     UTF8 
 * @version     1.0 
 * @since     1.0 
 */ 
class UTF8 {
    /**
     * Converts UTF-8 string to array of Unicode codepoints
     *
     * Decoding is a small state machine: a lead octet announces how many
     * continuation octets follow, and each continuation octet contributes
     * six more bits to the codepoint being assembled.
     *
     * Notes on behaviour:
     *  - Decoded codepoints are appended to $out; existing entries are kept.
     *  - U+FEFF (BOM / zero width no-break space) is skipped wherever it occurs
     *    and is not counted against $maxCP.
     *  - A multi-octet sequence cut off by the end of the string is NOT
     *    reported: its partial value is discarded and strlen($str) is returned.
     *
     * @param         &string $str Reference to UTF-8 string (only read, never modified)
     * @param         &mixed[] $out Reference to array to store the result
     * @param         int $start Starting byte offset in string
     * @param         int $maxCP Maximal number of codepoints to append during this call, PHP_INT_MAX when omitted
     * @return         int Returns offset of byte next to last converted in string
     * @throws         UTF8_Exception
     */
    public function strToCodepoints(&$str, &$out, $start, $maxCP = PHP_INT_MAX){
        $mState = 0;    // continuation octets still expected (0 = waiting for a lead octet)
        $mUcs4  = 0;    // codepoint value assembled so far
        $mBytes = 1;    // total octet length announced by the current lead octet
        $count  = 0;    // codepoints appended to $out by this call
        $len = strlen($str);
        for ($i = $start; ($count < $maxCP) && ($i < $len); $i++) {
            // Square brackets: the legacy $str{$i} form is a parse error since PHP 8.0.
            $in = ord($str[$i]);
            if (0 == $mState) {
                if (0 == (0x80 & $in)) {
                    // 0xxxxxxx: US-ASCII, single octet.
                    $out[] = $in;
                    $mBytes = 1;
                    $count++;
                } else if (0xC0 == (0xE0 & $in)) {
                    // 110xxxxx: lead octet of a 2-octet sequence.
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } else if (0xE0 == (0xF0 & $in)) {
                    // 1110xxxx: lead octet of a 3-octet sequence.
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } else if (0xF0 == (0xF8 & $in)) {
                    // 11110xxx: lead octet of a 4-octet sequence.
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } else if (0xF8 == (0xFC & $in)) {
                    // 111110xx: 5-octet form; consumed here, rejected once complete.
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } else if (0xFC == (0xFE & $in)) {
                    // 1111110x: 6-octet form; consumed here, rejected once complete.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    // Stray continuation octet, or 0xFE / 0xFF.
                    throw new UTF8_Exception('Octet #' . $i . '.', UTF8_Exception::INPUT_OCT_RANGE);
                }
            } else {
                if (0x80 == (0xC0 & $in)) {
                    // 10xxxxxx: merge the six payload bits at their position.
                    $shift = ($mState - 1) * 6;
                    $mUcs4 |= ($in & 0x0000003F) << $shift;
                    if (0 == --$mState) {
                        // Sequence complete: reject overlong encodings, 5/6-octet
                        // forms, surrogates and values beyond U+10FFFF.
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            ($mUcs4 > 0x10FFFF)) {
                            throw new UTF8_Exception('Octet #' . $i . '.', UTF8_Exception::INPUT_OCT_BAD);
                        }
                        // U+FEFF is dropped and does not count as a converted codepoint.
                        if (0xFEFF != $mUcs4) {
                            $out[] = $mUcs4;
                            $count++;
                        }
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }
                } else {
                    // A continuation octet was required but something else arrived.
                    throw new UTF8_Exception('Octet #' . $i . '.', UTF8_Exception::INPUT_OCT_INCOMPL);
                }
            }
        }
        return $i;
    }

    /**
     * Converts single Unicode codepoint to UTF-8 string
     *
     * U+FEFF is deliberately encoded as an empty string, mirroring the way
     * strToCodepoints() drops it while decoding.
     *
     * @param         int $cp The Unicode codepoint value
     * @return         string Returns UTF-8 string containing bytes representing codepoint (empty for U+FEFF)
     * @throws         UTF8_Exception When $cp is negative, a surrogate (U+D800..U+DFFF) or above U+10FFFF
     */
    public function codepointToStr($cp){
        $res = '';
        if ($cp < 0) {
            // The message reports the offending value; the previous code read an undefined variable here.
            throw new UTF8_Exception('Codepoint ' . $cp . '.', UTF8_Exception::OUTPUT_CP_NEG);
        } else if ($cp <= 0x007f) {
            // One octet: 0xxxxxxx
            $res .= chr($cp);
        } else if ($cp <= 0x07ff) {
            // Two octets: 110xxxxx 10xxxxxx
            $res .= chr(0xc0 | ($cp >> 6));
            $res .= chr(0x80 | ($cp & 0x003f));
        } else if ($cp == 0xFEFF) {
            // Intentionally produces no output.
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
            throw new UTF8_Exception('Codepoint ' . $cp . '.', UTF8_Exception::OUTPUT_CP_SUR);
        } else if ($cp <= 0xffff) {
            // Three octets: 1110xxxx 10xxxxxx 10xxxxxx
            $res .= chr(0xe0 | ($cp >> 12));
            $res .= chr(0x80 | (($cp >> 6) & 0x003f));
            $res .= chr(0x80 | ($cp & 0x003f));
        } else if ($cp <= 0x10ffff) {
            // Four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            $res .= chr(0xf0 | ($cp >> 18));
            $res .= chr(0x80 | (($cp >> 12) & 0x3f));
            $res .= chr(0x80 | (($cp >> 6) & 0x3f));
            $res .= chr(0x80 | ($cp & 0x3f));
        } else {
            throw new UTF8_Exception('Codepoint ' . $cp . '.', UTF8_Exception::OUTPUT_CP_RANGE);
        }
        return $res;
    }
}
 
/** 
 * Exception raised for UTF-8 conversion errors
 * 
 * See UTF8.php File description for full information 
 * 
 * @author     Alexey A.Znaev <[email protected]> <http://xbsoft.org> 
 * @link     http://xbsoft.org 
 * @package     UTF8 
 * @version     1.0 
 * @since     1.0 
 */ 
class UTF8_Exception extends Exception {
    // Error codes: the high nibble selects the error class (input / output),
    // the low nibble selects the specific error within that class.
    const INTERNAL          = 0x00;
    const INPUT             = 0x10;
    const INPUT_OCT_RANGE   = 0x11;
    const INPUT_OCT_BAD     = 0x12;
    const INPUT_OCT_INCOMPL = 0x13;
    const OUTPUT            = 0x20;
    const OUTPUT_CP_NEG     = 0x21;
    const OUTPUT_CP_SUR     = 0x22;
    const OUTPUT_CP_RANGE   = 0x23;

    /** Human-readable text for every known class code and specific code. */
    private static $Messages = array(
        self::INTERNAL          => 'Internal error.',
        self::INPUT             => 'Illegal input.',
        self::INPUT_OCT_RANGE   => 'Octet is neither in the US-ASCII range nor a legal first octet of a multi-octet sequence.',
        self::INPUT_OCT_BAD     => 'Illegal non-shortest form or surrogate character or codepoint outside the Unicode range.',
        self::INPUT_OCT_INCOMPL => 'Incomplete multi-octet sequence.',
        self::OUTPUT            => 'Bad output.',
        self::OUTPUT_CP_NEG     => 'Negative value.',
        self::OUTPUT_CP_SUR     => 'Surrogate value.',
        self::OUTPUT_CP_RANGE   => 'Out of range.'
    );

    /**
     * Creates the exception with a message of the form
     * "UTF8: <class text> <specific text> <caller message>".
     *
     * Parts whose code has no entry in the message table are left out.
     *
     * @param         string $message Caller-supplied detail appended after the standard text
     * @param         int $code One of the class constants
     * @param         Exception $previous Previous exception for chaining, if any
     */
    public function __construct($message = '', $code = 0x00, Exception $previous = null) {
        $standardText = self::standardTextFor($code);
        if ($standardText !== '') {
            $message = $standardText . ' ' . $message;
        }
        parent::__construct('UTF8: ' . $message, $code, $previous);
    }

    /**
     * Builds the standard text for a code: the text of its error class,
     * followed by the text of the specific error when the code is not
     * itself a class code.
     *
     * @param         int $code Error code
     * @return         string Standard text, or an empty string when nothing is known about the code
     */
    private static function standardTextFor($code) {
        $group = $code & 0xF0;
        $text = array_key_exists($group, self::$Messages) ? self::$Messages[$group] : '';
        if (($code != $group) && array_key_exists($code, self::$Messages)) {
            $text .= ' ' . self::$Messages[$code];
        }
        return $text;
    }
}
 
?> 
 
 |