2020-01-02 22:20:31 +07:00
/**
 * @package     Joomla.Administrator
 * @subpackage  com_finder
 *
 * @copyright   Copyright (C) 2005 - 2013 Open Source Matters, Inc. All rights reserved.
 * @license     GNU General Public License version 2 or later; see LICENSE
 */
// Stop immediately unless the _JEXEC constant has been defined.
defined('_JEXEC') or die;
// Register the indexer classes this helper refers to with JLoader, each mapped to its file in this directory.
JLoader::register('FinderIndexerParser', __DIR__ . '/parser.php');
JLoader::register('FinderIndexerStemmer', __DIR__ . '/stemmer.php');
JLoader::register('FinderIndexerToken', __DIR__ . '/token.php');
/**
 * Helper class for the Finder indexer package.
 *
 * @package     Joomla.Administrator
 * @subpackage  com_finder
 * @since       2.5
 */
class FinderIndexerHelper
/**
 * The token stemmer object. The stemmer is set by whatever class
 * wishes to use it, but it must be an instance of FinderIndexerStemmer.
 *
 * @var    FinderIndexerStemmer
 * @since  2.5
 */
public static $stemmer;
/**
 * Converts raw input into plain text using the parser that
 * FinderIndexerParser::getInstance() returns for the given format.
 *
 * @param   string  $input   The raw input.
 * @param   string  $format  The format of the input. [optional]
 *
 * @return  string  The parsed input.
 *
 * @since   2.5
 * @throws  Exception on invalid parser.
 */
public static function parse($input, $format = 'html')
{
    // Look up the parser for this format, then hand the input over to it.
    $parser = FinderIndexerParser::getInstance($format);

    return $parser->parse($input);
}
/**
 * Method to tokenize a text string.
 *
 * @param   string   $input   The input to tokenize.
 * @param   string   $lang    The language of the input.
 * @param   boolean  $phrase  Flag to indicate whether the input could be a phrase. [optional]
 *
 * @return  array  An array of FinderIndexerToken objects.
 *
 * @since   2.5
 */
public static function tokenize($input, $lang, $phrase = false)
// Normalizes $input with a fixed sequence of regex passes, splits it on spaces and wraps the
// resulting terms in FinderIndexerToken objects. Results for short inputs are kept in a static cache.
static $cache;
// Only inputs shorter than 128 characters get a cache key (md5 of input, language and phrase flag);
// for longer inputs $store is null and the cache is bypassed.
$store = JString::strlen($input) < 128 ? md5($input . '::' . $lang . '::' . $phrase) : null;
// Check if the string has been tokenized already.
if ($store && isset($cache[$store]))
return $cache[$store];
$tokens = array();
// The two typographic single quotes (U+2018, U+2019) and the ASCII apostrophe, decoded from their HTML entities.
$quotes = html_entity_decode('&#8216;&#8217;&#39;', ENT_QUOTES, 'UTF-8');
// Get the simple language key.
$lang = self::getPrimaryLanguage($lang);
* Parsing the string input into terms is a multi-step process.
* Regexes:
* 1. Remove everything except letters, numbers, quotes, apostrophe, plus, dash, period, and comma.
* 2. Remove plus, dash, period, and comma characters located before letter characters.
* 3. Remove plus, dash, period, and comma characters located after other characters.
* 4. Remove plus, period, and comma characters enclosed in alphabetical characters. Ungreedy.
* 5. Remove orphaned apostrophe, plus, dash, period, and comma characters.
* 6. Remove orphaned quote characters.
* 7. Replace the assorted single quotation marks with the ASCII standard single quotation.
* 8. Remove multiple space characters and replaces with a single space.
// The input is lower-cased first; the eight preg_replace() calls below correspond, in order, to steps 1-8 above.
$input = JString::strtolower($input);
$input = preg_replace('#[^\pL\pM\pN\p{Pi}\p{Pf}\'+-.,]+#mui', ' ', $input);
$input = preg_replace('#(^|\s)[+-.,]+([\pL\pM]+)#mui', ' $1', $input);
$input = preg_replace('#([\pL\pM\pN]+)[+-.,]+(\s|$)#mui', '$1 ', $input);
$input = preg_replace('#([\pL\pM]+)[+.,]+([\pL\pM]+)#muiU', '$1 $2', $input);
$input = preg_replace('#(^|\s)[\'+-.,]+(\s|$)#mui', ' ', $input);
$input = preg_replace('#(^|\s)[\p{Pi}\p{Pf}]+(\s|$)#mui', ' ', $input);
$input = preg_replace('#[' . $quotes . ']+#mui', '\'', $input);
$input = preg_replace('#\s+#mui', ' ', $input);
$input = JString::trim($input);
// Explode the normalized string to get the terms.
$terms = explode(' ', $input);
* If we have Unicode support and are dealing with Chinese text, Chinese
* has to be handled specially because there are not necessarily any spaces
* between the "words". So, we have to test if the words belong to the Chinese
* character set and if so, explode them into single glyphs or "words".
if ($lang === 'zh')
// Iterate through the terms and test if they contain Chinese.
for ($i = 0, $n = count($terms); $i < $n; $i++)
$charMatches = array();
// Collect every Han character found in the current term.
$charCount = preg_match_all('#[\p{Han}]#mui', $terms[$i], $charMatches);
// Split apart any groups of Chinese characters.
for ($j = 0; $j < $charCount; $j++)
// Remove the matched glyph from the term; whatever is left stays in place and the glyph is appended as its own term.
$tSplit = JString::str_ireplace($charMatches[0][$j], '', $terms[$i], false);
// NOTE(review): no branch is visible here for the case where nothing is left after removing the glyph - confirm against the complete file.
if (!empty($tSplit))
$terms[$i] = $tSplit;
$terms[] = $charMatches[0][$j];
// Reset array keys.
$terms = array_values($terms);
* If we have to handle the input as a phrase, that means we don't
* tokenize the individual terms and we do not create the two and three
* term combinations. The phrase must contain more than one word!
if ($phrase === true && count($terms) > 1)
// Create tokens from the phrase.
$tokens[] = new FinderIndexerToken($terms, $lang);
// NOTE(review): the per-term tokenizing below presumably belongs to an else branch that is not visible in this view - confirm.
// Create tokens from the terms.
for ($i = 0, $n = count($terms); $i < $n; $i++)
$tokens[] = new FinderIndexerToken($terms[$i], $lang);
// Create two and three word phrase tokens from the individual words.
// $n is fixed before the loop, so the derived tokens appended inside it are not themselves combined again.
for ($i = 0, $n = count($tokens); $i < $n; $i++)
// Setup the phrase positions.
$i2 = $i + 1;
$i3 = $i + 2;
// Create the two word phrase.
if ($i2 < $n && isset($tokens[$i2]))
// Tokenize the two word phrase.
// The third constructor argument is '' for Chinese and ' ' otherwise (presumably the glue used to join the terms).
$token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term), $lang, $lang === 'zh' ? '' : ' ');
$token->derived = true;
// Add the token to the stack.
$tokens[] = $token;
// Create the three word phrase.
if ($i3 < $n && isset($tokens[$i3]))
// Tokenize the three word phrase.
$token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term, $tokens[$i3]->term), $lang, $lang === 'zh' ? '' : ' ');
$token->derived = true;
// Add the token to the stack.
$tokens[] = $token;
// With more than one token the array is returned; otherwise the first token itself is returned
// (array_shift() yields null when there are no tokens). The same value is what gets cached.
if ($store)
$cache[$store] = count($tokens) > 1 ? $tokens : array_shift($tokens);
return $cache[$store];
return count($tokens) > 1 ? $tokens : array_shift($tokens);
/**
 * Reduces a token to its base word. The public
 * {@link FinderIndexerHelper::$stemmer} object does the stemming when it is
 * set; without a stemmer the token is returned after apostrophe trimming only.
 *
 * @param   string  $token  The token to stem.
 * @param   string  $lang   The language of the token.
 *
 * @return  string  The root token.
 *
 * @since   2.5
 */
public static function stem($token, $lang)
{
    // Drop apostrophes that wrap the token.
    $token = JString::trim($token, '\'');

    // If an apostrophe is still inside the token, keep only the part in front of it.
    $apostropheAt = JString::strpos($token, '\'');

    if ($apostropheAt !== false)
    {
        $token = JString::substr($token, 0, $apostropheAt);
    }

    // Without a usable stemmer the trimmed token is the result.
    if (!(self::$stemmer instanceof FinderIndexerStemmer))
    {
        return $token;
    }

    return self::$stemmer->stem($token, $lang);
}
/**
 * Method to add a content type to the database.
 *
 * @param   string  $title  The type of content. For example: PDF
 * @param   string  $mime   The mime type of the content. For example: application/pdf [optional]
 *
 * @return  integer  The id of the content type.
 *
 * @since   2.5
 * @throws  Exception on database error.
 */
public static function addContentType($title, $mime = null)
// Returns the id of the content type named $title. Known types are held in a static list indexed by
// title; a title that is not in the list is inserted and the new insert id is returned.
static $types;
$db = JFactory::getDbo();
$query = $db->getQuery(true);
// Check if the types are loaded.
if (empty($types))
// Build the query to get the types.
// NOTE(review): the statements that build this query and pass it to $db are not visible in this view - confirm against the complete file.
// Get the types.
$types = $db->loadObjectList('title');
// Check if the type already exists.
if (isset($types[$title]))
return (int) $types[$title]->id;
// Add the type.
// NOTE(review): the head of this insert chain (and the statement that runs it) is not visible in this view - confirm against the complete file.
->columns(array($db->quoteName('title'), $db->quoteName('mime')))
->values($db->quote($title) . ', ' . $db->quote($mime));
// NOTE(review): nothing visible adds the new row to $types, so a repeated call with the same new title may insert again - verify.
// Return the new id.
return (int) $db->insertid();
/**
 * Method to check if a token is common in a language.
 *
 * The common words of each language are fetched once through
 * getCommonWords() and kept in a static array for later calls.
 *
 * @param   string  $token  The token to test.
 * @param   string  $lang   The language to reference.
 *
 * @return  boolean  True if common, false otherwise.
 *
 * @since   2.5
 */
public static function isCommon($token, $lang)
{
    static $data;

    // Load the common tokens for the language if necessary.
    if (!isset($data[$lang]))
    {
        // The array cast turns a null result into an empty list, so it is cached (isset() would
        // otherwise fail again on every call) and in_array() always receives an array.
        // Values are normalized to strings so the strict comparison below is string-to-string.
        $data[$lang] = array_map('strval', (array) self::getCommonWords($lang));
    }

    // Compare strictly: a loose in_array() compares numeric strings as numbers, so a token such
    // as "1e3" would be reported as common when the list contains "1000".
    return in_array((string) $token, $data[$lang], true);
}
/**
 * Method to get an array of common terms for a language.
 *
 * @param   string  $lang  The language to use.
 *
 * @return  array  Array of common terms.
 *
 * @since   2.5
 * @throws  Exception on database error.
 */
public static function getCommonWords($lang)
// Fetches the common terms for $lang through the database object and returns the loaded column.
$db = JFactory::getDbo();
// Create the query to load all the common terms for the language.
// NOTE(review): only the language filter of this query chain is visible; the select/from part is not shown in this view - confirm against the complete file.
$query = $db->getQuery(true)
->where($db->quoteName('language') . ' = ' . $db->quote($lang));
// Load all of the common terms for the language.
// NOTE(review): no visible statement hands $query to $db before loadColumn() is called - confirm against the complete file.
$results = $db->loadColumn();
return $results;
/**
 * Returns the default language of the site, as configured in com_languages.
 * The value is looked up once and remembered for subsequent calls.
 *
 * @return  string  The default language string.
 *
 * @since   2.5
 */
public static function getDefaultLanguage()
{
    static $lang;

    // Already resolved on an earlier call.
    if (!empty($lang))
    {
        return $lang;
    }

    // The com_languages 'site' parameter is the best guess available; 'en-GB' is the fallback.
    $languageParams = JComponentHelper::getParams('com_languages');
    $lang = $languageParams->get('site', 'en-GB');

    return $lang;
}
/**
 * Method to parse a language/locale key and return a simple language string.
 *
 * @param   string  $lang  The language/locale key. For example: en-GB
 *
 * @return  string  The simple language string. For example: en
 *
 * @since   2.5
 */
public static function getPrimaryLanguage($lang)
// Reduces a language/locale key to its primary language and remembers each result in a static array keyed by the input.
static $data;
// Only parse the identifier if necessary.
if (!isset($data[$lang]))
// Use Locale::getPrimaryLanguage() when it is callable.
// NOTE(review): Locale is presumably the class from PHP's intl extension - confirm.
if (is_callable(array('Locale', 'getPrimaryLanguage')))
// Get the language key using the Locale package.
$data[$lang] = Locale::getPrimaryLanguage($lang);
// NOTE(review): the assignment below presumably sits in an else branch that is not visible in this view; as shown it would always overwrite the Locale result - confirm against the complete file.
// Get the language key using string position.
// Keeps the part of $lang in front of the first '-'.
// NOTE(review): for a key without '-', the result depends on what JString::strpos() returns and how JString::substr() treats that length - verify.
$data[$lang] = JString::substr($lang, 0, JString::strpos($lang, '-'));
return $data[$lang];
/**
 * Builds the path (SEF route) for a content item from its non-SEF URL.
 * The site router is created and configured on the first call only.
 *
 * @param   string  $url  The non-SEF route to the content item.
 *
 * @return  string  The path for the content item.
 *
 * @since   2.5
 */
public static function getContentPath($url)
{
    static $router;

    // Set the site router up on first use; later calls reuse the same instance.
    if (!($router instanceof JRouter))
    {
        include_once JPATH_SITE . '/includes/application.php';

        // The router mode follows the 'sef' configuration value (1 when unset).
        $siteConfig = JFactory::getConfig();
        $router = JRouter::getInstance('site');
        $router->setMode($siteConfig->get('sef', 1));
    }

    // Build the route and keep only its path, query and fragment parts.
    $routeParts = array('path', 'query', 'fragment');
    $route = $router->build($url)->toString($routeParts);

    // Remove the base path prefix so the returned route is relative.
    return str_replace(JUri::base(true) . '/', '', $route);
}
/**
 * Method to get extra data for a content item before it is indexed. This is how
 * we add Comments, Tags, Labels, etc. that should be available to Finder.
 *
 * @param   FinderIndexerResult  &$item  The item to index as a FinderIndexerResult object.
 *
 * @return  boolean  True on success, false on failure.
 *
 * @since   2.5
 * @throws  Exception on database error.
 */
public static function getContentExtras(FinderIndexerResult &$item)
// Triggers the onPrepareFinderContent event for $item and returns true when no handler reported failure.
// Get the event dispatcher.
$dispatcher = JEventDispatcher::getInstance();
// Load the finder plugin group.
// NOTE(review): the statement that loads the plugin group is not visible in this view - confirm against the complete file.
// Trigger the event.
// The item is placed in the argument array by reference (presumably so handlers can modify it - confirm).
$results = $dispatcher->trigger('onPrepareFinderContent', array(&$item));
// Check the returned results. This is for plugins that don't throw
// exceptions when they encounter serious errors.
// Any false among the handler results is converted into an Exception with code 500, using the dispatcher's error as the message.
if (in_array(false, $results))
throw new Exception($dispatcher->getError(), 500);
// NOTE(review): the try that this catch pairs with is not visible in this view - confirm against the complete file.
catch (Exception $e)
// Handle a caught exception.
// The exception is rethrown unchanged.
throw $e;
return true;
/**
 * Method to process content text using the onContentPrepare event trigger.
 *
 * @param   string     $text    The content to process.
 * @param   JRegistry  $params  The parameters object. [optional]
 *
 * @return  string  The processed content.
 *
 * @since   2.5
 */
public static function prepareContent($text, $params = null)
// Places $text on a Content table object, triggers onContentPrepare for it and returns the object's text afterwards.
static $loaded;
// Get the dispatcher.
$dispatcher = JEventDispatcher::getInstance();
// Load the content plugins if necessary.
// NOTE(review): only the flag assignment is visible here; the statement that loads the plugins is not shown in this view - confirm against the complete file.
if (empty($loaded))
$loaded = true;
// Instantiate the parameter object if necessary.
// Anything that is not a JRegistry, including the null default, is replaced by a new JRegistry.
if (!($params instanceof JRegistry))
$registry = new JRegistry;
$params = $registry;
// Create a mock content object.
$content = JTable::getInstance('Content');
$content->text = $text;
// Fire the onContentPrepare event.
// The context string is 'com_finder.indexer'; the content object and params are placed in the argument array by reference.
// NOTE(review): the trailing 0 is presumably the page/offset argument expected by onContentPrepare handlers - confirm.
$dispatcher->trigger('onContentPrepare', array('com_finder.indexer', &$content, &$params, 0));
return $content->text;