<?php
/**
 * UCTC - The Unicode Transcoder
 *
 * Converts between various flavours of Unicode representations like UCS-4 or UTF-8
 * Supported schemes:
 * - UCS-4 Little Endian / Big Endian / Array (partially)
 * - UTF-16 Little Endian / Big Endian (not yet)
 * - UTF-8
 * - UTF-7
 * - UTF-7 IMAP (modified UTF-7)
 *
 * @package phlyMail Nahariya 4.0+ Default branch
 * @author Matthias Sommerfeld <mso@phlyLabs.de>
 * @copyright 2003-2009 phlyLabs Berlin, http://phlylabs.de
 * @version 0.0.6 2009-05-10
 */
class uctc {
    /**
     * Names of the supported encodings. Every name except 'ucs4array' must have a
     * matching pair of private methods <name>_ucs4array() and ucs4array_<name>().
     * Not implemented yet: ucs4le, ucs4be, utf16, utf16le, utf16be.
     */
    private static $mechs = array('ucs4', 'ucs4array', 'utf8', 'utf7', 'utf7imap');

    /** Whether the UTF-8 decoder skips its range check on the first continuation byte */
    private static $allow_overlong = false;

    /** Whether invalid input is replaced by $safe_char instead of raising an exception */
    private static $safe_mode;

    /** Code point (integer) substituted for invalid input in safe mode */
    private static $safe_char;

    /**
     * The actual conversion routine
     *
     * The data is first decoded into an array of code points (unless it already is
     * one) and then encoded into the target representation.
     *
     * @param    mixed   $data       The data to convert, usually a string, array when converting from UCS-4 array
     * @param    string  $from       Original encoding of the data
     * @param    string  $to         Target encoding of the data
     * @param    bool    $safe_mode  SafeMode tries to correct invalid codepoints
     * @param    int     $safe_char  Code point used as replacement in SafeMode, defaults to U+FFFC
     * @return   mixed   String or array, depending on target encoding
     * @throws   Exception  On unknown encoding names or (outside SafeMode) malformed input
     * @access   public
     * @since    0.0.1
     */
    public static function convert($data, $from, $to, $safe_mode = false, $safe_char = 0xFFFC)
    {
        self::$safe_mode = ($safe_mode) ? true : false;
        self::$safe_char = ($safe_char) ? $safe_char : 0xFFFC;
        // Must be re-derived on every call: the state is static and would otherwise
        // leak from a SafeMode call into all later strict calls.
        self::$allow_overlong = self::$safe_mode;

        // Strict comparison: a loose in_array() accepts e.g. boolean true for any name
        if (!in_array($from, self::$mechs, true)) throw new Exception('Invalid input format specified');
        if (!in_array($to, self::$mechs, true)) throw new Exception('Invalid output format specified');

        if ($from != 'ucs4array') {
            $decoder = $from.'_ucs4array';
            $data = self::$decoder($data);
        }
        if ($to != 'ucs4array') {
            $encoder = 'ucs4array_'.$to;
            $data = self::$encoder($data);
        }
        return $data;
    }

    /**
     * This converts an UTF-8 encoded string to its UCS-4 representation
     *
     * Works as a small state machine:
     *  'next' - expecting an ASCII byte or the start byte of a multi byte sequence
     *  'add'  - expecting a continuation byte (10xxxxxx) of the current sequence
     *  'skip' - SafeMode only: swallowing the leftover continuation bytes of a
     *           sequence that was cut off by the end of the input
     *
     * @param    string  $input  The UTF-8 string to convert
     * @return   array   Array of 32bit values representing each codepoint
     * @throws   Exception  On malformed input when SafeMode is off
     * @access   private
     */
    private static function utf8_ucs4array($input)
    {
        $output = array();
        $out_len = 0;
        $inp_len = strlen($input);
        $mode = 'next';
        $test = 'none';
        $start_byte = 0;
        $next_byte = 0;
        for ($k = 0; $k < $inp_len; ++$k) {
            $v = ord($input[$k]); // Extract byte from input string

            if ('skip' == $mode) {
                if ($v >> 6 == 2) continue; // Remainder of a truncated sequence, already replaced
                $mode = 'next';
            }

            if ($v < 128) { // We found an ASCII char - put into string as is
                $output[$out_len] = $v;
                ++$out_len;
                if ('add' == $mode) { // ASCII in the middle of a multi byte sequence
                    if (!self::$safe_mode) {
                        throw new Exception('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                    }
                    // Replace the unfinished character preceding the ASCII char
                    $output[$out_len-2] = self::$safe_char;
                    $mode = 'next';
                }
                continue;
            }

            if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
                $start_byte = $v;
                $mode = 'add';
                $test = 'range';
                if ($v >> 5 == 6) { // &110xxxxx 10xxxxx
                    $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
                    $v = ($v - 192) << 6;
                } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
                    $next_byte = 1;
                    $v = ($v - 224) << 12;
                } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 2;
                    $v = ($v - 240) << 18;
                } elseif (self::$safe_mode) {
                    $mode = 'next';
                    $output[$out_len] = self::$safe_char;
                    ++$out_len;
                    continue;
                } else {
                    throw new Exception('This might be UTF-8, but I don\'t understand it at byte '.$k);
                }
                // Fewer bytes left than this sequence needs: the input is truncated
                if ($inp_len-$k-$next_byte < 2) {
                    if (!self::$safe_mode) {
                        throw new Exception('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                    }
                    $output[$out_len] = self::$safe_char;
                    ++$out_len;
                    $mode = 'skip';
                    continue;
                }
                // Store the bits of the start byte; continuation bytes get added below
                $output[$out_len] = (int) $v;
                ++$out_len;
                continue;
            }

            if ('add' == $mode) {
                // The first continuation byte decides whether the sequence is in legal range
                if (!self::$allow_overlong && $test == 'range') {
                    $test = 'none';
                    if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
                        throw new Exception('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
                    }
                }
                if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
                    $v = ($v-128) << ($next_byte*6);
                    $output[($out_len-1)] += $v;
                    --$next_byte;
                } elseif (self::$safe_mode) {
                    // $safe_char already is a code point, it must not be passed through ord()
                    $output[$out_len-1] = self::$safe_char;
                    $k--; // Look at this byte again, it might start a new sequence
                    $mode = 'next';
                    continue;
                } else {
                    throw new Exception('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                }
                if ($next_byte < 0) {
                    $mode = 'next';
                }
            }
        } // for
        return $output;
    }

    /**
     * Convert UCS-4 array into UTF-8 string
     *
     * Code points that do not fit into four UTF-8 bytes (or are negative) raise an
     * exception; in SafeMode they are replaced by the UTF-8 form of the safe char.
     *
     * @param    array   $input  Array of code points
     * @return   string  The UTF-8 encoded string
     * @throws   Exception  On an unencodable code point when SafeMode is off
     * @access   private
     */
    private static function ucs4array_utf8($input)
    {
        $output = '';
        foreach ($input as $k => $v) {
            $bytes = self::encode_utf8_codepoint($v);
            if (false === $bytes) {
                if (!self::$safe_mode) {
                    throw new Exception('Conversion from UCS-4 to UTF-8 failed: malformed input at index '.$k);
                }
                $bytes = self::encode_utf8_codepoint(self::$safe_char);
                // A caller supplied safe char may itself be unencodable: fall back to U+FFFC
                if (false === $bytes) $bytes = "\xEF\xBF\xBC";
            }
            $output .= $bytes;
        }
        return $output;
    }

    /**
     * Encode one code point as UTF-8
     *
     * @param    int     $v  The code point
     * @return   mixed   The UTF-8 bytes, false if the value is negative or needs more than 21 bits
     * @access   private
     */
    private static function encode_utf8_codepoint($v)
    {
        if ($v < 0) return false;
        if ($v < 128) { // 7bit are transferred literally
            return chr($v);
        }
        if ($v < (1 << 11)) { // 2 bytes
            return chr(192+($v >> 6)).chr(128+($v & 63));
        }
        if ($v < (1 << 16)) { // 3 bytes
            return chr(224+($v >> 12)).chr(128+(($v >> 6) & 63)).chr(128+($v & 63));
        }
        if ($v < (1 << 21)) { // 4 bytes
            return chr(240+($v >> 18)).chr(128+(($v >> 12) & 63)).chr(128+(($v >> 6) & 63)).chr(128+($v & 63));
        }
        return false;
    }

    /**
     * Convert a modified UTF-7 (IMAP mailbox name) string into UCS-4 array
     *
     * @param    string  $input  The modified UTF-7 string
     * @return   array   Array of code points
     * @access   private
     */
    private static function utf7imap_ucs4array($input)
    {
        // The ',' <-> '/' substitution is done inside the base64 runs only;
        // doing it on the whole string would destroy literal commas.
        return self::utf7_ucs4array($input, '&', true);
    }

    /**
     * Convert an UTF-7 string into UCS-4 array
     *
     * @param    string  $input  The UTF-7 string
     * @param    string  $sc     Shift character opening a base64 run ('+', '&' for IMAP)
     * @param    bool    $imap   Use the modified base64 alphabet (',' instead of '/')
     * @return   array   Array of code points
     * @access   private
     */
    private static function utf7_ucs4array($input, $sc = '+', $imap = false)
    {
        $alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+'.($imap ? ',' : '/');
        $output = array();
        $inp_len = strlen($input);
        $in_b64 = false;
        $b64 = '';

        for ($k = 0; $k < $inp_len; ++$k) {
            $c = $input[$k];
            if (0 == ord($c)) continue; // Ignore zero bytes
            if ($in_b64) {
                if (false !== strpos($alphabet, $c)) {
                    $b64 .= $c;
                    continue;
                }
                // Sequence got terminated
                $in_b64 = false;
                if ('-' == $c && $b64 == '') { // Shift char directly followed by '-' is a literal shift char
                    $output[] = ord($sc);
                    continue;
                }
                foreach (self::utf7_decode_block($b64, $imap) as $cp) {
                    $output[] = $cp;
                }
                $b64 = '';
                if ('-' == $c) continue; // The explicit terminator is absorbed
                // Any other terminating char is a regular char and gets handled below
            }
            if ($sc == $c) {
                $in_b64 = true;
                continue;
            }
            $output[] = ord($c);
        }
        // The end of the input terminates an open base64 run as well
        if ($in_b64 && $b64 != '') {
            foreach (self::utf7_decode_block($b64, $imap) as $cp) {
                $output[] = $cp;
            }
        }
        return $output;
    }

    /**
     * Decode the base64 payload of one UTF-7 shift sequence
     *
     * @param    string  $b64   The base64 chars between shift char and terminator
     * @param    bool    $imap  Whether ',' stands for '/' (modified base64)
     * @return   array   Array of code points; surrogate pairs are combined
     * @access   private
     */
    private static function utf7_decode_block($b64, $imap)
    {
        if ($imap) $b64 = strtr($b64, ',', '/');
        $raw = (string) base64_decode($b64);
        // The payload is UTF-16BE; drop a dangling odd byte left by padding bits or broken input
        $len = strlen($raw) - (strlen($raw) % 2);
        $units = array();
        for ($i = 0; $i < $len; $i += 2) {
            $units[] = (ord($raw[$i]) << 8) + ord($raw[$i+1]);
        }
        $output = array();
        $cnt = count($units);
        for ($i = 0; $i < $cnt; ++$i) {
            $u = $units[$i];
            if ($u >= 0xD800 && $u <= 0xDBFF && $i+1 < $cnt
                    && $units[$i+1] >= 0xDC00 && $units[$i+1] <= 0xDFFF) {
                // High + low surrogate form one code point beyond the BMP
                $u = 0x10000 + (($u - 0xD800) << 10) + ($units[$i+1] - 0xDC00);
                ++$i;
            }
            $output[] = $u;
        }
        return $output;
    }

    /**
     * Convert UCS-4 array into a modified UTF-7 (IMAP mailbox name) string
     *
     * @param    array   $input  Array of code points
     * @return   string  The modified UTF-7 string
     * @access   private
     */
    private static function ucs4array_utf7imap($input)
    {
        // The '/' -> ',' substitution is done inside the base64 runs only;
        // doing it on the whole string would destroy literal slashes.
        return self::ucs4array_utf7($input, '&', true);
    }

    /**
     * Convert UCS-4 array into UTF-7 string
     *
     * Printable ASCII (0x20 - 0x7e) except the shift char is written literally,
     * everything else is collected as UTF-16BE and written as one base64 run.
     *
     * @param    array   $input  Array of code points
     * @param    string  $sc     Shift character opening a base64 run ('+', '&' for IMAP)
     * @param    bool    $imap   Use the modified base64 alphabet (',' instead of '/')
     * @return   string  The UTF-7 string
     * @access   private
     */
    private static function ucs4array_utf7($input, $sc = '+', $imap = false)
    {
        $output = '';
        $pending = ''; // UTF-16BE bytes waiting to be written as base64
        $sc_code = ord($sc);
        foreach ($input as $v) {
            $is_direct = (0x20 <= $v && $v <= 0x7e && $v != $sc_code);
            if ($is_direct) {
                if ($pending != '') {
                    $output .= self::utf7_encode_block($pending, $sc, $imap);
                    $pending = '';
                }
                $output .= chr($v);
            } elseif ($v > 0xFFFF) {
                // Beyond the BMP: UTF-16 needs a surrogate pair
                $v -= 0x10000;
                $hi = 0xD800 + (($v >> 10) & 0x3FF);
                $lo = 0xDC00 + ($v & 0x3FF);
                $pending .= chr($hi >> 8).chr($hi & 255).chr($lo >> 8).chr($lo & 255);
            } else {
                $pending .= chr(($v >> 8) & 255).chr($v & 255);
            }
        }
        if ($pending != '') {
            $output .= self::utf7_encode_block($pending, $sc, $imap);
        }
        return $output;
    }

    /**
     * Build one UTF-7 shift sequence
     *
     * @param    string  $utf16  The UTF-16BE bytes to encode
     * @param    string  $sc     The shift character
     * @param    bool    $imap   Whether to write ',' instead of '/' (modified base64)
     * @return   string  Shift char, base64 payload without padding, '-' terminator
     * @access   private
     */
    private static function utf7_encode_block($utf16, $sc, $imap)
    {
        // A single literal shift char has the short form "<sc>-"
        if ($utf16 == chr(0).$sc) return $sc.'-';
        $b64 = str_replace('=', '', base64_encode($utf16));
        if ($imap) $b64 = strtr($b64, '/', ',');
        return $sc.$b64.'-';
    }

    /**
     * Convert UCS-4 array into UCS-4 string (Big Endian: most significant byte first)
     *
     * @param    array   $input  Array of code points
     * @return   string  Four bytes per code point
     * @access   private
     */
    private static function ucs4array_ucs4($input)
    {
        $output = '';
        foreach ($input as $v) {
            $output .= chr(($v >> 24) & 255).chr(($v >> 16) & 255).chr(($v >> 8) & 255).chr($v & 255);
        }
        return $output;
    }

    /**
     * Convert UCS-4 string (Big Endian: most significant byte first) into UCS-4 array
     *
     * @param    string  $input  The UCS-4 string, four bytes per code point
     * @return   array   Array of code points
     * @throws   Exception  If the input length is not a multiple of four
     * @access   private
     */
    private static function ucs4_ucs4array($input)
    {
        $output = array();
        $inp_len = strlen($input);
        // Input length must be dividable by 4
        if ($inp_len % 4) {
            throw new Exception('Input UCS4 string is broken');
        }
        for ($i = 0; $i < $inp_len; $i += 4) {
            $output[] = (ord($input[$i]) << 24) + (ord($input[$i+1]) << 16)
                    + (ord($input[$i+2]) << 8) + ord($input[$i+3]);
        }
        return $output;
    }
}
?>