You've already forked joomla_test
first commit
This commit is contained in:
506
administrator/components/com_finder/helpers/indexer/helper.php
Normal file
506
administrator/components/com_finder/helpers/indexer/helper.php
Normal file
@ -0,0 +1,506 @@
|
||||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2013 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
JLoader::register('FinderIndexerParser', __DIR__ . '/parser.php');
|
||||
JLoader::register('FinderIndexerStemmer', __DIR__ . '/stemmer.php');
|
||||
JLoader::register('FinderIndexerToken', __DIR__ . '/token.php');
|
||||
|
||||
/**
|
||||
* Helper class for the Finder indexer package.
|
||||
*
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
* @since 2.5
|
||||
*/
|
||||
class FinderIndexerHelper
|
||||
{
|
||||
/**
 * Stemmer consulted by FinderIndexerHelper::stem().
 *
 * This class never assigns the property itself; code that wants stemming is
 * expected to store a FinderIndexerStemmer instance here. While the value is
 * not such an instance, stem() hands tokens back without running a stemmer.
 *
 * @var    FinderIndexerStemmer
 * @since  2.5
 */
public static $stemmer = null;
|
||||
|
||||
/**
 * Converts raw input of a given format into plain text.
 *
 * The work is delegated to the FinderIndexerParser instance returned for $format.
 *
 * @param   string  $input   The raw input.
 * @param   string  $format  The format of the input; 'html' when omitted. [optional]
 *
 * @return  string  The parsed input.
 *
 * @since   2.5
 * @throws  Exception on invalid parser.
 */
public static function parse($input, $format = 'html')
{
    // Look up the parser responsible for the requested format.
    $parser = FinderIndexerParser::getInstance($format);

    // Let that parser produce the plain-text version of the input.
    return $parser->parse($input);
}
|
||||
|
||||
/**
 * Method to tokenize a text string.
 *
 * The input is lower-cased, cleaned of punctuation that does not belong to a
 * term, and split on spaces. For the 'zh' primary language, Han glyphs are
 * additionally split out into terms of their own. Unless the input is handled
 * as a phrase, derived two and three word tokens are appended after the
 * single-term tokens.
 *
 * Results for inputs shorter than 128 characters are memoised in a static
 * cache keyed on the input, language and phrase flag.
 *
 * @param   string   $input   The input to tokenize.
 * @param   string   $lang    The language of the input.
 * @param   boolean  $phrase  Flag to indicate whether input could be a phrase. [optional]
 *
 * @return  mixed  An array of FinderIndexerToken objects when more than one token was
 *                 produced; otherwise the result of array_shift() on the token list,
 *                 i.e. the single FinderIndexerToken (or null for an empty list).
 *
 * @since   2.5
 */
public static function tokenize($input, $lang, $phrase = false)
{
    static $cache;

    // Only short inputs are cached; longer ones get no cache key.
    $store = JString::strlen($input) < 128 ? md5($input . '::' . $lang . '::' . $phrase) : null;

    // Check if the string has been tokenized already.
    if ($store && isset($cache[$store]))
    {
        return $cache[$store];
    }

    $tokens = array();

    // Left/right typographic single quotes and the ASCII apostrophe. They are written as
    // numeric entities so that the single-quoted PHP literal stays well formed.
    $quotes = html_entity_decode('&#8216;&#8217;&#39;', ENT_QUOTES, 'UTF-8');

    // Get the simple language key.
    $lang = self::getPrimaryLanguage($lang);

    /*
     * Parsing the string input into terms is a multi-step process.
     *
     * Regexes:
     *  1. Remove everything except letters, numbers, quotes, apostrophe, plus, dash, period, and comma.
     *  2. Remove plus, dash, period, and comma characters located before letter characters.
     *  3. Remove plus, dash, period, and comma characters located after other characters.
     *  4. Remove plus, period, and comma characters enclosed in alphabetical characters. Ungreedy.
     *  5. Remove orphaned apostrophe, plus, dash, period, and comma characters.
     *  6. Remove orphaned quote characters.
     *  7. Replace the assorted single quotation marks with the ASCII standard single quotation.
     *  8. Remove multiple space characters and replaces with a single space.
     *
     * Note: inside the character classes "+-." is a range (U+002B to U+002E), which
     * covers exactly plus, comma, dash and period.
     */
    $input = JString::strtolower($input);
    $input = preg_replace('#[^\pL\pM\pN\p{Pi}\p{Pf}\'+-.,]+#mui', ' ', $input);

    // Step 2 keeps the word (group 2); only the leading punctuation is discarded.
    $input = preg_replace('#(^|\s)[+-.,]+([\pL\pM]+)#mui', ' $2', $input);
    $input = preg_replace('#([\pL\pM\pN]+)[+-.,]+(\s|$)#mui', '$1 ', $input);
    $input = preg_replace('#([\pL\pM]+)[+.,]+([\pL\pM]+)#muiU', '$1 $2', $input);
    $input = preg_replace('#(^|\s)[\'+-.,]+(\s|$)#mui', ' ', $input);
    $input = preg_replace('#(^|\s)[\p{Pi}\p{Pf}]+(\s|$)#mui', ' ', $input);
    $input = preg_replace('#[' . $quotes . ']+#mui', '\'', $input);
    $input = preg_replace('#\s+#mui', ' ', $input);
    $input = JString::trim($input);

    // Explode the normalized string to get the terms.
    $terms = explode(' ', $input);

    /*
     * If we have Unicode support and are dealing with Chinese text, Chinese
     * has to be handled specially because there are not necessarily any spaces
     * between the "words". So, we have to test if the words belong to the Chinese
     * character set and if so, explode them into single glyphs or "words".
     */
    if ($lang === 'zh')
    {
        // Iterate through the terms and test if they contain Chinese.
        for ($i = 0, $n = count($terms); $i < $n; $i++)
        {
            $charMatches = array();
            $charCount   = preg_match_all('#[\p{Han}]#mui', $terms[$i], $charMatches);

            // Split apart any groups of Chinese characters.
            for ($j = 0; $j < $charCount; $j++)
            {
                // An earlier glyph may already have emptied and removed this term.
                $remaining = isset($terms[$i]) ? $terms[$i] : '';
                $tSplit    = JString::str_ireplace($charMatches[0][$j], '', $remaining, false);

                if (!empty($tSplit))
                {
                    $terms[$i] = $tSplit;
                }
                else
                {
                    unset($terms[$i]);
                }

                // The glyph itself becomes a term of its own.
                $terms[] = $charMatches[0][$j];
            }
        }

        // Reset array keys.
        $terms = array_values($terms);
    }

    /*
     * If we have to handle the input as a phrase, that means we don't
     * tokenize the individual terms and we do not create the two and three
     * term combinations. The phrase must contain more than one word!
     */
    if ($phrase === true && count($terms) > 1)
    {
        // Create tokens from the phrase.
        $tokens[] = new FinderIndexerToken($terms, $lang);
    }
    else
    {
        // Create tokens from the terms.
        for ($i = 0, $n = count($terms); $i < $n; $i++)
        {
            $tokens[] = new FinderIndexerToken($terms[$i], $lang);
        }

        // Terms of a derived phrase are joined without a separator for 'zh', with a space otherwise.
        $glue = $lang === 'zh' ? '' : ' ';

        // Create two and three word phrase tokens from the individual words.
        // $n is fixed before the loop, so the derived tokens appended below are never revisited.
        for ($i = 0, $n = count($tokens); $i < $n; $i++)
        {
            // Setup the phrase positions.
            $i2 = $i + 1;
            $i3 = $i + 2;

            // Create the two word phrase.
            if ($i2 < $n && isset($tokens[$i2]))
            {
                // Tokenize the two word phrase.
                $token          = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term), $lang, $glue);
                $token->derived = true;

                // Add the token to the stack.
                $tokens[] = $token;
            }

            // Create the three word phrase.
            if ($i3 < $n && isset($tokens[$i3]))
            {
                // Tokenize the three word phrase.
                $token          = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term, $tokens[$i3]->term), $lang, $glue);
                $token->derived = true;

                // Add the token to the stack.
                $tokens[] = $token;
            }
        }
    }

    // A lone token is returned unwrapped; several tokens are returned as an array.
    $result = count($tokens) > 1 ? $tokens : array_shift($tokens);

    if ($store)
    {
        $cache[$store] = $result;
    }

    return $result;
}
|
||||
|
||||
/**
 * Reduces a token to its base word.
 *
 * Apostrophes are stripped from both ends of the token and, if one remains
 * inside it, the token is cut at that position. The cleaned token is then
 * passed to {@link FinderIndexerHelper::$stemmer} when that property holds a
 * FinderIndexerStemmer; otherwise the cleaned token itself is returned.
 *
 * @param   string  $token  The token to stem.
 * @param   string  $lang   The language of the token.
 *
 * @return  string  The root token.
 *
 * @since   2.5
 */
public static function stem($token, $lang)
{
    // Drop apostrophes from both ends of the token.
    $word = JString::trim($token, '\'');

    // Cut the token at the apostrophe position reported by JString::strpos(), if any.
    $cut = JString::strpos($word, '\'');

    if ($cut !== false)
    {
        $word = JString::substr($word, 0, $cut);
    }

    // Without a usable stemmer the cleaned token is the result.
    if (!(self::$stemmer instanceof FinderIndexerStemmer))
    {
        return $word;
    }

    return self::$stemmer->stem($word, $lang);
}
|
||||
|
||||
/**
 * Method to add a content type to the database.
 *
 * The rows of #__finder_types are loaded once into a static cache keyed by
 * title. If the title is already known its id is returned; otherwise a new
 * row is inserted and added to the cache, so that repeated calls with the
 * same title do not insert the type again.
 *
 * @param   string  $title  The type of content. For example: PDF
 * @param   string  $mime   The mime type of the content. For example: PDF [optional]
 *
 * @return  integer  The id of the content type.
 *
 * @since   2.5
 * @throws  Exception on database error.
 */
public static function addContentType($title, $mime = null)
{
    static $types;

    $db    = JFactory::getDbo();
    $query = $db->getQuery(true);

    // Check if the types are loaded.
    if (empty($types))
    {
        // Build the query to get the types.
        $query->select('*')
            ->from($db->quoteName('#__finder_types'));

        // Get the types, indexed by their title.
        $db->setQuery($query);
        $types = $db->loadObjectList('title');
    }

    // Check if the type already exists.
    if (isset($types[$title]))
    {
        return (int) $types[$title]->id;
    }

    // Add the type.
    $query->clear()
        ->insert($db->quoteName('#__finder_types'))
        ->columns(array($db->quoteName('title'), $db->quoteName('mime')))
        ->values($db->quote($title) . ', ' . $db->quote($mime));
    $db->setQuery($query);
    $db->execute();

    // Remember the new type so a later call with the same title finds it in the cache.
    $type        = new stdClass;
    $type->id    = (int) $db->insertid();
    $type->title = $title;
    $type->mime  = $mime;

    $types[$title] = $type;

    // Return the new id.
    return $type->id;
}
|
||||
|
||||
/**
 * Method to check if a token is common in a language.
 *
 * The common terms of each language are fetched once through
 * getCommonWords() and kept in a static cache for later calls.
 *
 * @param   string  $token  The token to test.
 * @param   string  $lang   The language to reference.
 *
 * @return  boolean  True if common, false otherwise.
 *
 * @since   2.5
 */
public static function isCommon($token, $lang)
{
    static $data;

    // Load the common tokens for the language if necessary.
    if (!isset($data[$lang]))
    {
        $data[$lang] = self::getCommonWords($lang);
    }

    // Compare strictly: a loose comparison would treat numeric-looking strings
    // such as '1e3' and '1000' as equal. The token is cast so that non-string
    // scalars still compare against the stored terms (presumably strings as
    // loaded from the database -- confirm for the drivers in use).
    return in_array((string) $token, $data[$lang], true);
}
|
||||
|
||||
/**
 * Loads the common terms stored for a language.
 *
 * Reads the "term" column of #__finder_terms_common for every row whose
 * "language" column equals $lang.
 *
 * @param   string  $lang  The language to use.
 *
 * @return  array  Array of common terms.
 *
 * @since   2.5
 * @throws  Exception on database error.
 */
public static function getCommonWords($lang)
{
    $db    = JFactory::getDbo();
    $query = $db->getQuery(true);

    // Select every common term recorded for the requested language.
    $query->select($db->quoteName('term'));
    $query->from($db->quoteName('#__finder_terms_common'));
    $query->where($db->quoteName('language') . ' = ' . $db->quote($lang));

    // Run the query and hand back the single result column.
    $db->setQuery($query);

    return $db->loadColumn();
}
|
||||
|
||||
/**
 * Returns the default language of the site.
 *
 * The value is read from the "site" parameter of com_languages, falling back
 * to 'en-GB', and is kept in a static variable for subsequent calls.
 *
 * @return  string  The default language string.
 *
 * @since   2.5
 */
public static function getDefaultLanguage()
{
    static $lang;

    // Reuse the value found by an earlier call.
    if (!empty($lang))
    {
        return $lang;
    }

    // com_languages holds the site default language; it is the best guess available.
    $languageParams = JComponentHelper::getParams('com_languages');
    $lang           = $languageParams->get('site', 'en-GB');

    return $lang;
}
|
||||
|
||||
/**
 * Method to parse a language/locale key and return a simple language string.
 *
 * Locale::getPrimaryLanguage() is used when it is callable. Otherwise the key
 * is cut at its first dash; a key without a dash is returned unchanged.
 * Results are cached per key in a static array.
 *
 * @param   string  $lang  The language/locale key. For example: en-GB
 *
 * @return  string  The simple language string. For example: en
 *
 * @since   2.5
 */
public static function getPrimaryLanguage($lang)
{
    static $data;

    // Only parse the identifier if necessary.
    if (!isset($data[$lang]))
    {
        if (is_callable(array('Locale', 'getPrimaryLanguage')))
        {
            // Get the language key using the Locale package.
            $data[$lang] = Locale::getPrimaryLanguage($lang);
        }
        else
        {
            // Get the language key using string position.
            $dash = JString::strpos($lang, '-');

            // A key without a region part (e.g. "en") already is the primary language;
            // do not pass the boolean false on as a substring length.
            $data[$lang] = ($dash === false) ? $lang : JString::substr($lang, 0, $dash);
        }
    }

    return $data[$lang];
}
|
||||
|
||||
/**
 * Builds the path (SEF route) for a content item.
 *
 * A site router is obtained once, configured with the "sef" configuration
 * value (default 1) and reused by later calls. The route built for $url is
 * reduced to its path, query and fragment, and the base path reported by
 * JUri::base(true) followed by '/' is removed from it.
 *
 * @param   string  $url  The non-SEF route to the content item.
 *
 * @return  string  The path for the content item.
 *
 * @since   2.5
 */
public static function getContentPath($url)
{
    static $router;

    // The router is created and configured on first use only.
    if (!($router instanceof JRouter))
    {
        jimport('joomla.application.router');
        include_once JPATH_SITE . '/includes/application.php';

        // Fetch the site router and apply the configured SEF mode.
        $config  = JFactory::getConfig();
        $router  = JRouter::getInstance('site');
        $sefMode = $config->get('sef', 1);
        $router->setMode($sefMode);
    }

    // Build the route and keep only its relative parts.
    $relative = $router->build($url)->toString(array('path', 'query', 'fragment'));

    // NOTE(review): str_replace() removes every occurrence of the base path plus '/',
    // not just a leading one; if JUri::base(true) can be an empty string this strips
    // all slashes from the route -- confirm against callers before relying on it.
    return str_replace(JUri::base(true) . '/', '', $relative);
}
|
||||
|
||||
/**
 * Method to get extra data for a content before being indexed. This is how
 * we add Comments, Tags, Labels, etc. that should be available to Finder.
 *
 * The 'finder' plugin group is imported and the onPrepareFinderContent event
 * is triggered with the item passed by reference. Exceptions thrown by
 * plugins propagate to the caller unchanged.
 *
 * @param   FinderIndexerResult  &$item  The item to index as an FinderIndexerResult object.
 *
 * @return  boolean  Always true; failures are reported by exception.
 *
 * @since   2.5
 * @throws  Exception if a plugin throws, or if a plugin returns boolean false.
 */
public static function getContentExtras(FinderIndexerResult &$item)
{
    // Get the event dispatcher.
    $dispatcher = JEventDispatcher::getInstance();

    // Load the finder plugin group.
    JPluginHelper::importPlugin('finder');

    // Trigger the event.
    $results = $dispatcher->trigger('onPrepareFinderContent', array(&$item));

    // Check the returned results. This is for plugins that don't throw
    // exceptions when they encounter serious errors. The comparison is strict
    // so that only a real boolean false counts as a failure, not 0 or ''.
    if (in_array(false, $results, true))
    {
        throw new Exception($dispatcher->getError(), 500);
    }

    return true;
}
|
||||
|
||||
/**
 * Runs content text through the onContentPrepare event.
 *
 * The 'content' plugin group is imported on the first call. The text is
 * placed on a 'Content' table instance that serves as a mock content object,
 * the event is fired with the context 'com_finder.indexer', and whatever text
 * the object holds afterwards is returned.
 *
 * @param   string     $text    The content to process.
 * @param   JRegistry  $params  The parameters object. Anything that is not a JRegistry
 *                              is loaded into a new JRegistry via loadString(). [optional]
 *
 * @return  string  The processed content.
 *
 * @since   2.5
 */
public static function prepareContent($text, $params = null)
{
    static $loaded;

    $dispatcher = JEventDispatcher::getInstance();

    // Import the content plugins the first time through.
    if (!$loaded)
    {
        JPluginHelper::importPlugin('content');
        $loaded = true;
    }

    // Wrap anything that is not already a registry in a new one.
    if (!($params instanceof JRegistry))
    {
        $wrapped = new JRegistry;
        $wrapped->loadString($params);
        $params = $wrapped;
    }

    // Use a content table instance as a stand-in for a real content item.
    $mock       = JTable::getInstance('Content');
    $mock->text = $text;

    // Fire the onContentPrepare event; the mock object and params are passed by reference.
    $eventArgs = array('com_finder.indexer', &$mock, &$params, 0);
    $dispatcher->trigger('onContentPrepare', $eventArgs);

    return $mock->text;
}
|
||||
}
|
Reference in New Issue
Block a user