File: UTF8.php

Recommend this page to a friend!
???
File:	`UTF8.php`
Role:	Class source
Content type:	`text/plain`
Description:	UTF8 - helper class
Class:	SCSU Encode and decode Unicode strings with SCSU
Author:	By Alexey Znaev
Last change:
Date:	13 years ago
Size:	`6,725 bytes`
Download

<?php

/**

 * UTF8 - very fast UTF-8 converter implementation for PHP

 *

 * It converts between UTF-8 string and codepoints very fast

 * Suitable for Standard Compression Scheme for Unicode <http://www.unicode.org/reports/tr6/>

 * Based on PHP source of UTF8 functions by Henri Sivonen <[email protected]> <http://iki.fi/hsivonen/php-utf8/>

 *

 * @author     Alexey A.Znaev <[email protected]> <http://xbsoft.org>

 * @copyright     Copyright (C) 2011-2012 Alexey A.Znaev

 * @license     http://www.gnu.org/licenses GNU Public License version 3

 * @link     http://xbsoft.org

 * @package     UTF8

 * @version     1.0

 */



// -----------------------------------------------------------------------------



/**

 * Provides methods for very fast UTF-8 conversion

 *

 * See UTF8.php File description for full information

 *

 * @author     Alexey A.Znaev <[email protected]> <http://xbsoft.org>

 * @link     http://xbsoft.org

 * @package     UTF8

 * @version     1.0

 * @since     1.0

 */

class UTF8 {

    /**
     * Converts UTF-8 string to array of Unicode codepoints
     *
     * Decoded codepoints are appended to $out; elements already present are kept.
     * Decoding stops at the end of the string or as soon as $maxCP codepoints
     * have been appended by this call. U+FEFF (byte order mark) is skipped:
     * it is neither stored in $out nor counted against $maxCP.
     *
     * Note: a multi-octet sequence that is cut off by the end of the string is
     * not reported - the loop simply ends and the partial codepoint is discarded.
     *
     * @param         &string $str Reference to UTF-8 string
     * @param         &mixed[] $out Reference to array to store the result
     * @param         int $start Starting byte offset in string
     * @param         int $maxCP Maximal number of codepoints to append, PHP_INT_MAX when omitted
     * @return         int Returns offset of byte next to last converted in string
     * @throws         UTF8_Exception INPUT_OCT_RANGE when an octet cannot start a sequence,
     *                 INPUT_OCT_INCOMPL when a continuation octet is missing,
     *                 INPUT_OCT_BAD for overlong forms, surrogates, 5/6-octet sequences
     *                 and codepoints above U+10FFFF
     */
    public function strToCodepoints(&$str, &$out, $start, $maxCP = PHP_INT_MAX){
        $mState = 0; // number of continuation octets still expected
        $mUcs4  = 0; // codepoint being assembled
        $mBytes = 1; // declared length of the sequence being decoded
        $count  = 0; // codepoints appended to $out by this call
        $len = strlen($str);

        for ($i = $start; ($count < $maxCP) && ($i < $len); $i++) {
            // Square brackets: the curly-brace offset syntax no longer parses in PHP 8.
            $in = ord($str[$i]);

            if (0 == $mState) {
                // Expecting the first octet of a sequence.
                if (0 == (0x80 & $in)) {
                    // US-ASCII, single octet.
                    $out[] = $in;
                    $mBytes = 1;
                    $count++;
                } elseif (0xC0 == (0xE0 & $in)) {
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } elseif (0xE0 == (0xF0 & $in)) {
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } elseif (0xF0 == (0xF8 & $in)) {
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } elseif (0xF8 == (0xFC & $in)) {
                    // 5-octet lead: decoded, then rejected once the sequence completes.
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } elseif (0xFC == (0xFE & $in)) {
                    // 6-octet lead: decoded, then rejected once the sequence completes.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    throw new UTF8_Exception('Octet #' . $i . '.', UTF8_Exception::INPUT_OCT_RANGE);
                }
            } else {
                // Expecting a continuation octet (10xxxxxx).
                if (0x80 != (0xC0 & $in)) {
                    throw new UTF8_Exception('Octet #' . $i . '.', UTF8_Exception::INPUT_OCT_INCOMPL);
                }

                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                if (0 == --$mState) {
                    // Sequence complete: reject overlong forms, sequences longer
                    // than 4 octets, surrogates and values above U+10FFFF.
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        ($mUcs4 > 0x10FFFF)) {
                        throw new UTF8_Exception('Octet #' . $i . '.', UTF8_Exception::INPUT_OCT_BAD);
                    }

                    // The byte order mark is dropped and not counted.
                    if (0xFEFF != $mUcs4) {
                        $out[] = $mUcs4;
                        $count++;
                    }

                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }
            }
        }

        return $i;
    }

    /**
     * Converts single Unicode codepoint to UTF-8 string
     *
     * U+FEFF (byte order mark) produces an empty string.
     *
     * @param         int $cp The Unicode codepoint value
     * @return         string Returns UTF-8 string containing bytes representing codepoint
     * @throws         UTF8_Exception OUTPUT_CP_NEG for negative values, OUTPUT_CP_SUR for
     *                 surrogates (U+D800..U+DFFF), OUTPUT_CP_RANGE for values above U+10FFFF
     */
    public function codepointToStr($cp){
        // The exception messages report the offending value itself; the
        // previous code interpolated an undefined variable here.
        if ($cp < 0) {
            throw new UTF8_Exception('Codepoint #' . $cp . '.', UTF8_Exception::OUTPUT_CP_NEG);
        }

        if ($cp <= 0x007f) {
            return chr($cp);
        }

        if ($cp <= 0x07ff) {
            return chr(0xc0 | ($cp >> 6))
                 . chr(0x80 | ($cp & 0x003f));
        }

        if ($cp == 0xFEFF) {
            // The byte order mark is never emitted.
            return '';
        }

        if ($cp >= 0xD800 && $cp <= 0xDFFF) {
            throw new UTF8_Exception('Codepoint #' . $cp . '.', UTF8_Exception::OUTPUT_CP_SUR);
        }

        if ($cp <= 0xffff) {
            return chr(0xe0 | ($cp >> 12))
                 . chr(0x80 | (($cp >> 6) & 0x003f))
                 . chr(0x80 | ($cp & 0x003f));
        }

        if ($cp <= 0x10ffff) {
            return chr(0xf0 | ($cp >> 18))
                 . chr(0x80 | (($cp >> 12) & 0x3f))
                 . chr(0x80 | (($cp >> 6) & 0x3f))
                 . chr(0x80 | ($cp & 0x3f));
        }

        throw new UTF8_Exception('Codepoint #' . $cp . '.', UTF8_Exception::OUTPUT_CP_RANGE);
    }

}



/**

 * Provides exceptions of UTF-8 converting errors

 *

 * See UTF8.php File description for full information

 *

 * @author     Alexey A.Znaev <[email protected]> <http://xbsoft.org>

 * @link     http://xbsoft.org

 * @package     UTF8

 * @version     1.0

 * @since     1.0

 */

class UTF8_Exception extends Exception {

    // Error codes: the high nibble selects the error class (internal, input,
    // output); a non-zero low nibble selects a specific error in that class.
    const INTERNAL          = 0x00;

    const INPUT             = 0x10;
    const INPUT_OCT_RANGE   = 0x11;
    const INPUT_OCT_BAD     = 0x12;
    const INPUT_OCT_INCOMPL = 0x13;

    const OUTPUT            = 0x20;
    const OUTPUT_CP_NEG     = 0x21;
    const OUTPUT_CP_SUR     = 0x22;
    const OUTPUT_CP_RANGE   = 0x23;

    // Human-readable text for every error class and every specific error.
    private static $Messages = array(
        self::INTERNAL          => 'Internal error.',
        self::INPUT             => 'Illegal input.',
        self::INPUT_OCT_RANGE   => 'Octet is neither in the US-ASCII range nor a legal first octet of a multi-octet sequence.',
        self::INPUT_OCT_BAD     => 'Illegal non-shortest form or surrogate character or codepoint outside the Unicode range.',
        self::INPUT_OCT_INCOMPL => 'Incomplete multi-octet sequence.',
        self::OUTPUT            => 'Bad output.',
        self::OUTPUT_CP_NEG     => 'Negative value.',
        self::OUTPUT_CP_SUR     => 'Surrogate value.',
        self::OUTPUT_CP_RANGE   => 'Out of range.'
    );

    /**
     * Builds the exception message as
     * "UTF8: <class text> <specific text> <caller message>".
     *
     * The class text is looked up from the high nibble of $code; the specific
     * text is appended only when $code differs from its class code and has an
     * entry of its own. When neither lookup yields text, the caller's message
     * is used unprefixed.
     *
     * @param         string $message Extra detail appended after the standard text
     * @param         int $code One of the class constants
     * @param         Exception $previous Previous exception, if any
     */
    public function __construct($message = '', $code = 0x00, Exception $previous = null) {
        $family = $code & 0xF0;

        $prefix = array_key_exists($family, self::$Messages)
            ? self::$Messages[$family]
            : '';

        if (($code != $family) && array_key_exists($code, self::$Messages)) {
            $prefix = $prefix . ' ' . self::$Messages[$code];
        }

        $text = empty($prefix) ? $message : $prefix . ' ' . $message;

        parent::__construct('UTF8: ' . $text, $code, $previous);
    }

}



?>
About us
Advertise on this site
For more information send a message to info at phpclasses dot org.
File: UTF8.php

Contents