libraries/phputf8/utils/bad.php

<?php
/**
* @version $Id$
* Tools for locating / replacing bad bytes in UTF-8 strings
* The Original Code is Mozilla Communicator client code.
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
* Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
* Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
* @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
* @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
* @see http://hsivonen.iki.fi/php-utf8/
* @package utf8
* @subpackage bad
* @see utf8_is_valid
*/

//--------------------------------------------------------------------
/**
* Locates the first bad byte in a UTF-8 string returning it's
* byte index in the string
* PCRE Pattern to locate bad bytes in a UTF-8 string
* Comes from W3 FAQ: Multilingual Forms
* Note: modified to include full ASCII range including control chars
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string
* @return mixed integer byte index or FALSE if no bad found
* @package utf8
* @subpackage bad
*/
function utf8_bad_find($str) {
    $UTF8_BAD =
    '([\x00-\x7F]'.                          # ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
    '|(.{1}))';                              # invalid byte
    $pos = 0;
    $badList = array();
    while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
        $bytes = strlen($matches[0]);
        if ( isset($matches[2])) {
            return $pos;
        }
        $pos += $bytes;
        $str = substr($str,$bytes);
    }
    return FALSE;
}

//--------------------------------------------------------------------
/**
* Locates all bad bytes in a UTF-8 string and returns a list of their
* byte index in the string
* PCRE Pattern to locate bad bytes in a UTF-8 string
* Comes from W3 FAQ: Multilingual Forms
* Note: modified to include full ASCII range including control chars
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string
* @return mixed array of integers or FALSE if no bad found
* @package utf8
* @subpackage bad
*/
function utf8_bad_findall($str) {
    $UTF8_BAD =
    '([\x00-\x7F]'.                          # ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
    '|(.{1}))';                              # invalid byte
    $pos = 0;
    $badList = array();
    while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
        $bytes = strlen($matches[0]);
        if ( isset($matches[2])) {
            $badList[] = $pos;
        }
        $pos += $bytes;
        $str = substr($str,$bytes);
    }
    if ( count($badList) > 0 ) {
        return $badList;
    }
    return FALSE;
}

//--------------------------------------------------------------------
/**
* Strips out any bad bytes from a UTF-8 string and returns the rest
* PCRE Pattern to locate bad bytes in a UTF-8 string
* Comes from W3 FAQ: Multilingual Forms
* Note: modified to include full ASCII range including control chars
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string
* @return string
* @package utf8
* @subpackage bad
*/
function utf8_bad_strip($str) {
    $UTF8_BAD =
    '([\x00-\x7F]'.                          # ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
    '|(.{1}))';                              # invalid byte
    ob_start();
    while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
        if ( !isset($matches[2])) {
            echo $matches[0];
        }
        $str = substr($str,strlen($matches[0]));
    }
    $result = ob_get_contents();
    ob_end_clean();
    return $result;
}

//--------------------------------------------------------------------
/**
* Replace bad bytes with an alternative character - ASCII character
* recommended is replacement char
* PCRE Pattern to locate bad bytes in a UTF-8 string
* Comes from W3 FAQ: Multilingual Forms
* Note: modified to include full ASCII range including control chars
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string to search
* @param string to replace bad bytes with (defaults to '?') - use ASCII
* @return string
* @package utf8
* @subpackage bad
*/
function utf8_bad_replace($str, $replace = '?') {
    $UTF8_BAD =
    '([\x00-\x7F]'.                          # ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
    '|(.{1}))';                              # invalid byte
    ob_start();
    while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
        if ( !isset($matches[2])) {
            echo $matches[0];
        } else {
            echo $replace;
        }
        $str = substr($str,strlen($matches[0]));
    }
    $result = ob_get_contents();
    ob_end_clean();
    return $result;
}

//--------------------------------------------------------------------
/**
* Return code from utf8_bad_identify() when a five octet sequence is detected.
* Note: 5 octets sequences are valid UTF-8 but are not supported by Unicode so
* do not represent a useful character
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_5OCTET',1);

/**
* Return code from utf8_bad_identify() when a six octet sequence is detected.
* Note: 6 octets sequences are valid UTF-8 but are not supported by Unicode so
* do not represent a useful character
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_6OCTET',2);

/**
* Return code from utf8_bad_identify().
* Invalid octet for use as start of multi-byte UTF-8 sequence
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SEQID',3);

/**
* Return code from utf8_bad_identify().
* From Unicode 3.1, non-shortest form is illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_NONSHORT',4);

/**
* Return code from utf8_bad_identify().
* From Unicode 3.2, surrogate characters are illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SURROGATE',5);

/**
* Return code from utf8_bad_identify().
* Codepoints outside the Unicode range are illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_UNIOUTRANGE',6);

/**
* Return code from utf8_bad_identify().
* Incomplete multi-octet sequence
* Note: this is kind of a "catch-all"
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SEQINCOMPLETE',7);

//--------------------------------------------------------------------
/**
* Reports on the type of bad byte found in a UTF-8 string. Returns a
* status code on the first bad byte found
* @author <hsivonen@iki.fi>
* @param string UTF-8 encoded string
* @return mixed integer constant describing problem or FALSE if valid UTF-8
* @see utf8_bad_explain
* @see http://hsivonen.iki.fi/php-utf8/
* @package utf8
* @subpackage bad
*/
function utf8_bad_identify($str, &$i) {

    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str{$i});

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {

                /* First octet of 5 octet sequence.
                *
                * This is illegal because the encoded codepoint must be either
                * (a) not the shortest form or
                * (b) outside the Unicode range of 0-0x10FFFF.
                */

                return UTF8_BAD_5OCTET;

            } else if (0xFC == (0xFE & ($in))) {

                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                return UTF8_BAD_6OCTET;

            } else {
                // Current octet is neither in the US-ASCII range nor a legal first
                // octet of a multi-octet sequence.
                return UTF8_BAD_SEQID;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                * End of the multi-octet sequence. mUcs4 now contains the final
                * Unicode codepoint to be output
                */
                if (0 == --$mState) {

                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ) {
                        return UTF8_BAD_NONSHORT;

                    // From Unicode 3.2, surrogate characters are illegal
                    } else if (($mUcs4 & 0xFFFFF800) == 0xD800) {
                        return UTF8_BAD_SURROGATE;

                    // Codepoints outside the Unicode range are illegal
                    } else if ($mUcs4 > 0x10FFFF) {
                        return UTF8_BAD_UNIOUTRANGE;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } else {
                // ((0xC0 & (*in) != 0x80) && (mState != 0))
                // Incomplete multi-octet sequence.
                $i--;
                return UTF8_BAD_SEQINCOMPLETE;
            }
        }
    }

    if ( $mState != 0 ) {
        // Incomplete multi-octet sequence.
        $i--;
        return UTF8_BAD_SEQINCOMPLETE;
    }

    // No bad octets found
    $i = NULL;
    return FALSE;
}

//--------------------------------------------------------------------
/**
* Takes a return code from utf8_bad_identify() are returns a message
* (in English) explaining what the problem is.
* @param int return code from utf8_bad_identify
* @return mixed string message or FALSE if return code unknown
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
function utf8_bad_explain($code) {

    switch ($code) {

        case UTF8_BAD_5OCTET:
            return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';
        break;

        case UTF8_BAD_6OCTET:
            return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';
        break;

        case UTF8_BAD_SEQID:
            return 'Invalid octet for use as start of multi-byte UTF-8 sequence';
        break;

        case UTF8_BAD_NONSHORT:
            return 'From Unicode 3.1, non-shortest form is illegal';
        break;

        case UTF8_BAD_SURROGATE:
            return 'From Unicode 3.2, surrogate characters are illegal';
        break;

        case UTF8_BAD_UNIOUTRANGE:
            return 'Codepoints outside the Unicode range are illegal';
        break;

        case UTF8_BAD_SEQINCOMPLETE:
            return 'Incomplete multi-octet sequence';
        break;

    }

    trigger_error('Unknown error code: '.$code,E_USER_WARNING);
    return FALSE;

}
first commit 2020-01-02 22:20:31 +07:00			`<?php`
			`/**`
			`* @version $Id$`
			`* Tools for locating / replacing bad bytes in UTF-8 strings`
			`* The Original Code is Mozilla Communicator client code.`
			`* The Initial Developer of the Original Code is`
			`* Netscape Communications Corporation.`
			`* Portions created by the Initial Developer are Copyright (C) 1998`
			`* the Initial Developer. All Rights Reserved.`
			`* Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)`
			`* Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)`
			`* @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp`
			`* @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp`
			`* @see http://hsivonen.iki.fi/php-utf8/`
			`* @package utf8`
			`* @subpackage bad`
			`* @see utf8_is_valid`
			`*/`

			`//--------------------------------------------------------------------`
			`/**`
			`* Locates the first bad byte in a UTF-8 string returning it's`
			`* byte index in the string`
			`* PCRE Pattern to locate bad bytes in a UTF-8 string`
			`* Comes from W3 FAQ: Multilingual Forms`
			`* Note: modified to include full ASCII range including control chars`
			`* @see http://www.w3.org/International/questions/qa-forms-utf-8`
			`* @param string`
			`* @return mixed integer byte index or FALSE if no bad found`
			`* @package utf8`
			`* @subpackage bad`
			`*/`
			`function utf8_bad_find($str) {`
			`$UTF8_BAD =`
			`'([\x00-\x7F]'. # ASCII (including control chars)`
			`'\|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte`
			`'\|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs`
			`'\|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte`
			`'\|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates`
			`'\|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3`
			`'\|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15`
			`'\|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16`
			`'\|(.{1}))'; # invalid byte`
			`$pos = 0;`
			`$badList = array();`
			`while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {`
			`$bytes = strlen($matches[0]);`
			`if ( isset($matches[2])) {`
			`return $pos;`
			`}`
			`$pos += $bytes;`
			`$str = substr($str,$bytes);`
			`}`
			`return FALSE;`
			`}`

			`//--------------------------------------------------------------------`
			`/**`
			`* Locates all bad bytes in a UTF-8 string and returns a list of their`
			`* byte index in the string`
			`* PCRE Pattern to locate bad bytes in a UTF-8 string`
			`* Comes from W3 FAQ: Multilingual Forms`
			`* Note: modified to include full ASCII range including control chars`
			`* @see http://www.w3.org/International/questions/qa-forms-utf-8`
			`* @param string`
			`* @return mixed array of integers or FALSE if no bad found`
			`* @package utf8`
			`* @subpackage bad`
			`*/`
			`function utf8_bad_findall($str) {`
			`$UTF8_BAD =`
			`'([\x00-\x7F]'. # ASCII (including control chars)`
			`'\|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte`
			`'\|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs`
			`'\|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte`
			`'\|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates`
			`'\|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3`
			`'\|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15`
			`'\|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16`
			`'\|(.{1}))'; # invalid byte`
			`$pos = 0;`
			`$badList = array();`
			`while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {`
			`$bytes = strlen($matches[0]);`
			`if ( isset($matches[2])) {`
			`$badList[] = $pos;`
			`}`
			`$pos += $bytes;`
			`$str = substr($str,$bytes);`
			`}`
			`if ( count($badList) > 0 ) {`
			`return $badList;`
			`}`
			`return FALSE;`
			`}`

			`//--------------------------------------------------------------------`
			`/**`
			`* Strips out any bad bytes from a UTF-8 string and returns the rest`
			`* PCRE Pattern to locate bad bytes in a UTF-8 string`
			`* Comes from W3 FAQ: Multilingual Forms`
			`* Note: modified to include full ASCII range including control chars`
			`* @see http://www.w3.org/International/questions/qa-forms-utf-8`
			`* @param string`
			`* @return string`
			`* @package utf8`
			`* @subpackage bad`
			`*/`
			`function utf8_bad_strip($str) {`
			`$UTF8_BAD =`
			`'([\x00-\x7F]'. # ASCII (including control chars)`
			`'\|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte`
			`'\|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs`
			`'\|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte`
			`'\|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates`
			`'\|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3`
			`'\|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15`
			`'\|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16`
			`'\|(.{1}))'; # invalid byte`
			`ob_start();`
			`while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {`
			`if ( !isset($matches[2])) {`
			`echo $matches[0];`
			`}`
			`$str = substr($str,strlen($matches[0]));`
			`}`
			`$result = ob_get_contents();`
			`ob_end_clean();`
			`return $result;`
			`}`

			`//--------------------------------------------------------------------`
			`/**`
			`* Replace bad bytes with an alternative character - ASCII character`
			`* recommended is replacement char`
			`* PCRE Pattern to locate bad bytes in a UTF-8 string`
			`* Comes from W3 FAQ: Multilingual Forms`
			`* Note: modified to include full ASCII range including control chars`
			`* @see http://www.w3.org/International/questions/qa-forms-utf-8`
			`* @param string to search`
			`* @param string to replace bad bytes with (defaults to '?') - use ASCII`
			`* @return string`
			`* @package utf8`
			`* @subpackage bad`
			`*/`
			`function utf8_bad_replace($str, $replace = '?') {`
			`$UTF8_BAD =`
			`'([\x00-\x7F]'. # ASCII (including control chars)`
			`'\|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte`
			`'\|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs`
			`'\|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte`
			`'\|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates`
			`'\|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3`
			`'\|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15`
			`'\|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16`
			`'\|(.{1}))'; # invalid byte`
			`ob_start();`
			`while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {`
			`if ( !isset($matches[2])) {`
			`echo $matches[0];`
			`} else {`
			`echo $replace;`
			`}`
			`$str = substr($str,strlen($matches[0]));`
			`}`
			`$result = ob_get_contents();`
			`ob_end_clean();`
			`return $result;`
			`}`

			`//--------------------------------------------------------------------`
			`/**`
			`* Return code from utf8_bad_identify() when a five octet sequence is detected.`
			`* Note: 5 octets sequences are valid UTF-8 but are not supported by Unicode so`
			`* do not represent a useful character`
			`* @see utf8_bad_identify`
			`* @package utf8`
			`* @subpackage bad`
			`*/`
			`define('UTF8_BAD_5OCTET',1);`

			`/**`
			`* Return code from utf8_bad_identify() when a six octet sequence is detected.`
			`* Note: 6 octets sequences are valid UTF-8 but are not supported by Unicode so`
			`* do not represent a useful character`
			`* @see utf8_bad_identify`
			`* @package utf8`
			`* @subpackage bad`
			`*/`
			`define('UTF8_BAD_6OCTET',2);`

			`/**`
			`* Return code from utf8_bad_identify().`
			`* Invalid octet for use as start of multi-byte UTF-8 sequence`
			`* @see utf8_bad_identify`
			`* @package utf8`
			`* @subpackage bad`
			`*/`
			`define('UTF8_BAD_SEQID',3);`

			`/**`
			`* Return code from utf8_bad_identify().`
			`* From Unicode 3.1, non-shortest form is illegal`
			`* @see utf8_bad_identify`
			`* @package utf8`
			`* @subpackage bad`
			`*/`
			`define('UTF8_BAD_NONSHORT',4);`

			`/**`
			`* Return code from utf8_bad_identify().`
			`* From Unicode 3.2, surrogate characters are illegal`
			`* @see utf8_bad_identify`
			`* @package utf8`
			`* @subpackage bad`
			`*/`
			`define('UTF8_BAD_SURROGATE',5);`

			`/**`
			`* Return code from utf8_bad_identify().`
			`* Codepoints outside the Unicode range are illegal`
			`* @see utf8_bad_identify`
			`* @package utf8`
			`* @subpackage bad`
			`*/`
			`define('UTF8_BAD_UNIOUTRANGE',6);`

			`/**`
			`* Return code from utf8_bad_identify().`
			`* Incomplete multi-octet sequence`
			`* Note: this is kind of a "catch-all"`
			`* @see utf8_bad_identify`
			`* @package utf8`
			`* @subpackage bad`
			`*/`
			`define('UTF8_BAD_SEQINCOMPLETE',7);`

			`//--------------------------------------------------------------------`
			`/**`
			`* Reports on the type of bad byte found in a UTF-8 string. Returns a`
			`* status code on the first bad byte found`
			`* @author <hsivonen@iki.fi>`
			`* @param string UTF-8 encoded string`
			`* @return mixed integer constant describing problem or FALSE if valid UTF-8`
			`* @see utf8_bad_explain`
			`* @see http://hsivonen.iki.fi/php-utf8/`
			`* @package utf8`
			`* @subpackage bad`
			`*/`
			`function utf8_bad_identify($str, &$i) {`

			`$mState = 0; // cached expected number of octets after the current octet`
			`// until the beginning of the next UTF8 character sequence`
			`$mUcs4 = 0; // cached Unicode character`
			`$mBytes = 1; // cached expected number of octets in the current sequence`

			`$len = strlen($str);`

			`for($i = 0; $i < $len; $i++) {`

			`$in = ord($str{$i});`

			`if ( $mState == 0) {`

			`// When mState is zero we expect either a US-ASCII character or a`
			`// multi-octet sequence.`
			`if (0 == (0x80 & ($in))) {`
			`// US-ASCII, pass straight through.`
			`$mBytes = 1;`

			`} else if (0xC0 == (0xE0 & ($in))) {`
			`// First octet of 2 octet sequence`
			`$mUcs4 = ($in);`
			`$mUcs4 = ($mUcs4 & 0x1F) << 6;`
			`$mState = 1;`
			`$mBytes = 2;`

			`} else if (0xE0 == (0xF0 & ($in))) {`
			`// First octet of 3 octet sequence`
			`$mUcs4 = ($in);`
			`$mUcs4 = ($mUcs4 & 0x0F) << 12;`
			`$mState = 2;`
			`$mBytes = 3;`

			`} else if (0xF0 == (0xF8 & ($in))) {`
			`// First octet of 4 octet sequence`
			`$mUcs4 = ($in);`
			`$mUcs4 = ($mUcs4 & 0x07) << 18;`
			`$mState = 3;`
			`$mBytes = 4;`

			`} else if (0xF8 == (0xFC & ($in))) {`

			`/* First octet of 5 octet sequence.`
			`*`
			`* This is illegal because the encoded codepoint must be either`
			`* (a) not the shortest form or`
			`* (b) outside the Unicode range of 0-0x10FFFF.`
			`*/`

			`return UTF8_BAD_5OCTET;`

			`} else if (0xFC == (0xFE & ($in))) {`

			`// First octet of 6 octet sequence, see comments for 5 octet sequence.`
			`return UTF8_BAD_6OCTET;`

			`} else {`
			`// Current octet is neither in the US-ASCII range nor a legal first`
			`// octet of a multi-octet sequence.`
			`return UTF8_BAD_SEQID;`

			`}`

			`} else {`

			`// When mState is non-zero, we expect a continuation of the multi-octet`
			`// sequence`
			`if (0x80 == (0xC0 & ($in))) {`

			`// Legal continuation.`
			`$shift = ($mState - 1) * 6;`
			`$tmp = $in;`
			`$tmp = ($tmp & 0x0000003F) << $shift;`
			`$mUcs4 \|= $tmp;`

			`/**`
			`* End of the multi-octet sequence. mUcs4 now contains the final`
			`* Unicode codepoint to be output`
			`*/`
			`if (0 == --$mState) {`

			`// From Unicode 3.1, non-shortest form is illegal`
			`if (((2 == $mBytes) && ($mUcs4 < 0x0080)) \|\|`
			`((3 == $mBytes) && ($mUcs4 < 0x0800)) \|\|`
			`((4 == $mBytes) && ($mUcs4 < 0x10000)) ) {`
			`return UTF8_BAD_NONSHORT;`

			`// From Unicode 3.2, surrogate characters are illegal`
			`} else if (($mUcs4 & 0xFFFFF800) == 0xD800) {`
			`return UTF8_BAD_SURROGATE;`

			`// Codepoints outside the Unicode range are illegal`
			`} else if ($mUcs4 > 0x10FFFF) {`
			`return UTF8_BAD_UNIOUTRANGE;`
			`}`

			`//initialize UTF8 cache`
			`$mState = 0;`
			`$mUcs4 = 0;`
			`$mBytes = 1;`
			`}`

			`} else {`
			`// ((0xC0 & (*in) != 0x80) && (mState != 0))`
			`// Incomplete multi-octet sequence.`
			`$i--;`
			`return UTF8_BAD_SEQINCOMPLETE;`
			`}`
			`}`
			`}`

			`if ( $mState != 0 ) {`
			`// Incomplete multi-octet sequence.`
			`$i--;`
			`return UTF8_BAD_SEQINCOMPLETE;`
			`}`

			`// No bad octets found`
			`$i = NULL;`
			`return FALSE;`
			`}`

			`//--------------------------------------------------------------------`
			`/**`
			`* Takes a return code from utf8_bad_identify() are returns a message`
			`* (in English) explaining what the problem is.`
			`* @param int return code from utf8_bad_identify`
			`* @return mixed string message or FALSE if return code unknown`
			`* @see utf8_bad_identify`
			`* @package utf8`
			`* @subpackage bad`
			`*/`
			`function utf8_bad_explain($code) {`

			`switch ($code) {`

			`case UTF8_BAD_5OCTET:`
			`return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';`
			`break;`

			`case UTF8_BAD_6OCTET:`
			`return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';`
			`break;`

			`case UTF8_BAD_SEQID:`
			`return 'Invalid octet for use as start of multi-byte UTF-8 sequence';`
			`break;`

			`case UTF8_BAD_NONSHORT:`
			`return 'From Unicode 3.1, non-shortest form is illegal';`
			`break;`

			`case UTF8_BAD_SURROGATE:`
			`return 'From Unicode 3.2, surrogate characters are illegal';`
			`break;`

			`case UTF8_BAD_UNIOUTRANGE:`
			`return 'Codepoints outside the Unicode range are illegal';`
			`break;`

			`case UTF8_BAD_SEQINCOMPLETE:`
			`return 'Incomplete multi-octet sequence';`
			`break;`

			`}`

			`trigger_error('Unknown error code: '.$code,E_USER_WARNING);`
			`return FALSE;`

			`}`