174 lines
5.1 KiB
PHP
174 lines
5.1 KiB
PHP
|
<?php
|
||
|
/**
|
||
|
* Locate a byte index given a UTF-8 character index
|
||
|
* @version $Id$
|
||
|
* @package utf8
|
||
|
* @subpackage position
|
||
|
*/
|
||
|
|
||
|
//--------------------------------------------------------------------
|
||
|
/**
 * Locate the byte index of one or more UTF-8 character positions.
 *
 * Given a string and a character index in the string, in terms of the
 * UTF-8 character position, returns the byte index of that character.
 * Can be useful when you want to use PHP's native (byte oriented) string
 * functions, but be warned: locating the byte can be expensive.
 *
 * Takes a variable number of parameters - the first must be the string
 * to search, then 1 to n UTF-8 character positions to obtain byte
 * indexes for. It is more efficient to look up several positions in one
 * call than to make repeated calls to this function.
 *
 * Positions are processed from lowest to highest, so when several are
 * given the returned array is in ascending order of position, not in
 * the order the arguments were supplied. Positions at or before the
 * start of the string yield 0; positions beyond the end of the string
 * yield the byte length of the string.
 *
 * @author Chris Smith<chris@jalakai.co.uk>
 * @param string string to locate index in
 * @param int (n times) UTF-8 character position
 * @return mixed int if exactly one position was given, otherwise an
 *               array of ints (empty when no position was given);
 *               boolean FALSE if the first argument is not a string
 * @package utf8
 * @subpackage position
 */
function utf8_byte_position() {

    $args = func_get_args();

    // array_shift() returns a value rather than a variable, so it is
    // assigned normally; assigning it by reference raises a notice
    $str = array_shift($args);
    if (!is_string($str)) return false;

    $result = array();

    // trivial byte index, character offset pair
    $prev = array(0,0);

    // use a short piece of str to estimate bytes per character
    // $i (& $j) -> byte indexes into $str
    $i = utf8_locate_next_chr($str, 300);

    // $c -> character offset into $str
    // utf8_decode() emits one byte per UTF-8 character, so strlen() of
    // its result is a character count
    // NOTE(review): utf8_decode() is deprecated as of PHP 8.2
    $c = strlen(utf8_decode(substr($str,0,$i)));

    // deal with arguments from lowest to highest
    sort($args);

    foreach ($args as $offset) {

        // positions at or before the start of the string are byte 0;
        // letting a negative position into the search below would leave
        // $c out of step with $i and corrupt every later lookup
        if ($offset <= 0) { $result[] = 0; continue; }

        // already standing on the requested character (e.g. a repeated
        // position): answer directly, otherwise the "no progress" test
        // below would mistake this for running off the end of the string
        if ($offset == $c) { $result[] = $i; continue; }

        // ensure no endless looping
        $safety_valve = 50;

        do {

            if ( ($c - $prev[1]) == 0 ) {
                // Hack: gone past end of string
                // (no characters were covered since the previous step,
                // which also protects the division below)
                $error = 0;
                $i = strlen($str);
                break;
            }

            // extrapolate from the last two known (byte, character)
            // pairs to estimate the byte index of the wanted character
            $j = $i + (int)(($offset-$c) * ($i - $prev[0]) / ($c - $prev[1]));

            // correct to utf8 character boundary
            $j = utf8_locate_next_chr($str, $j);

            // save the index, offset for use next iteration
            $prev = array($i,$c);

            if ($j > $i) {
                // moved forward: add the characters stepped over
                $c += strlen(utf8_decode(substr($str,$i,$j-$i)));
            } else {
                // moved backward: subtract the characters stepped over
                $c -= strlen(utf8_decode(substr($str,$j,$i-$j)));
            }

            // distance, in characters, still left to the wanted position
            $error = abs($c-$offset);

            // ready for next time around
            $i = $j;

        // from 7 it is faster to iterate over the string
        } while ( ($error > 7) && --$safety_valve) ;

        if ($error && $error <= 7) {

            // close enough: walk the remaining characters one at a time
            if ($c < $offset) {
                // move up
                while ($error--) { $i = utf8_locate_next_chr($str,++$i); }
            } else {
                // move down
                while ($error--) { $i = utf8_locate_current_chr($str,--$i); }
            }

            // ready for next arg
            $c = $offset;
        }

        $result[] = $i;
    }

    // a single position is returned bare rather than wrapped in an array
    if ( count($result) == 1 ) {
        return $result[0];
    }

    return $result;
}
|
||
|
|
||
|
//--------------------------------------------------------------------
|
||
|
/**
 * Given a string and any byte index, returns the byte index
 * of the start of the current UTF-8 character, relative to supplied
 * position. If the current character begins at the same place as the
 * supplied byte index, that byte index will be returned. Otherwise
 * this function will step backwards, looking for the index where
 * current UTF-8 character begins.
 *
 * An index at or below 0 yields 0 and an index at or beyond the end of
 * the string yields the byte length of the string.
 *
 * The string is taken by value (it is only read), so literals and
 * expressions may be passed as well as variables.
 *
 * @author Chris Smith<chris@jalakai.co.uk>
 * @param string
 * @param int byte index in the string
 * @return int byte index of start of current UTF-8 character
 * @package utf8
 * @subpackage position
 */
function utf8_locate_current_chr( $str, $idx ) {

    if ($idx <= 0) return 0;

    $limit = strlen($str);
    if ($idx >= $limit) return $limit;

    // Binary value for any byte after the first in a multi-byte UTF-8 character
    // will be like 10xxxxxx so & 0xC0 can be used to detect this kind
    // of byte - assuming well formed UTF-8
    while ($idx && ((ord($str[$idx]) & 0xC0) == 0x80)) $idx--;

    return $idx;
}
|
||
|
|
||
|
//--------------------------------------------------------------------
|
||
|
/**
 * Given a string and any byte index, returns the byte index
 * of the start of the next UTF-8 character, relative to supplied
 * position. If the next character begins at the same place as the
 * supplied byte index, that byte index will be returned.
 *
 * An index at or below 0 yields 0; an index at or beyond the end of
 * the string, or inside the last character of the string, yields the
 * byte length of the string.
 *
 * The string is taken by value (it is only read), so literals and
 * expressions may be passed as well as variables.
 *
 * @author Chris Smith<chris@jalakai.co.uk>
 * @param string
 * @param int byte index in the string
 * @return int byte index of start of next UTF-8 character
 * @package utf8
 * @subpackage position
 */
function utf8_locate_next_chr( $str, $idx ) {

    if ($idx <= 0) return 0;

    $limit = strlen($str);
    if ($idx >= $limit) return $limit;

    // Binary value for any byte after the first in a multi-byte UTF-8 character
    // will be like 10xxxxxx so & 0xC0 can be used to detect this kind
    // of byte - assuming well formed UTF-8
    while (($idx < $limit) && ((ord($str[$idx]) & 0xC0) == 0x80)) $idx++;

    return $idx;
}
|
||
|
|