first commit

2020-01-02 22:20:31 +07:00
commit 10eb3340ad
5753 changed files with 631345 additions and 0 deletions
--- a/administrator/components/com_finder/helpers/indexer/helper.php
+++ b/administrator/components/com_finder/helpers/indexer/helper.php
@@ -0,0 +1,506 @@
+<?php
+/**
+ * @package     Joomla.Administrator
+ * @subpackage  com_finder
+ *
+ * @copyright   Copyright (C) 2005 - 2013 Open Source Matters, Inc. All rights reserved.
+ * @license     GNU General Public License version 2 or later; see LICENSE
+ */
+
+defined('_JEXEC') or die;
+
+JLoader::register('FinderIndexerParser', __DIR__ . '/parser.php');
+JLoader::register('FinderIndexerStemmer', __DIR__ . '/stemmer.php');
+JLoader::register('FinderIndexerToken', __DIR__ . '/token.php');
+
+/**
+ * Helper class for the Finder indexer package.
+ *
+ * @package     Joomla.Administrator
+ * @subpackage  com_finder
+ * @since       2.5
+ */
+class FinderIndexerHelper
+{
+	/**
+	 * The token stemmer object. The stemmer is set by whatever class
+	 * wishes to use it but it must be an instance of FinderIndexerStemmer.
+	 *
+	 * @var		FinderIndexerStemmer
+	 * @since	2.5
+	 */
+	public static $stemmer;
+
+	/**
+	 * Method to parse input into plain text.
+	 *
+	 * @param   string  $input   The raw input.
+	 * @param   string  $format  The format of the input. [optional]
+	 *
+	 * @return  string  The parsed input.
+	 *
+	 * @since   2.5
+	 * @throws  Exception on invalid parser.
+	 */
+	public static function parse($input, $format = 'html')
+	{
+		// Get a parser for the specified format and parse the input.
+		return FinderIndexerParser::getInstance($format)->parse($input);
+	}
+
+	/**
+	 * Method to tokenize a text string.
+	 *
+	 * @param   string   $input   The input to tokenize.
+	 * @param   string   $lang    The language of the input.
+	 * @param   boolean  $phrase  Flag to indicate whether input could be a phrase. [optional]
+	 *
+	 * @return  array  An array of FinderIndexerToken objects.
+	 *
+	 * @since   2.5
+	 */
+	public static function tokenize($input, $lang, $phrase = false)
+	{
+		static $cache;
+		$store = JString::strlen($input) < 128 ? md5($input . '::' . $lang . '::' . $phrase) : null;
+
+		// Check if the string has been tokenized already.
+		if ($store && isset($cache[$store]))
+		{
+			return $cache[$store];
+		}
+
+		$tokens = array();
+		$quotes = html_entity_decode('&#8216;&#8217;&#39;', ENT_QUOTES, 'UTF-8');
+
+		// Get the simple language key.
+		$lang = self::getPrimaryLanguage($lang);
+
+		/*
+		 * Parsing the string input into terms is a multi-step process.
+		 *
+		 * Regexes:
+		 *  1. Remove everything except letters, numbers, quotes, apostrophe, plus, dash, period, and comma.
+		 *  2. Remove plus, dash, period, and comma characters located before letter characters.
+		 *  3. Remove plus, dash, period, and comma characters located after other characters.
+		 *  4. Remove plus, period, and comma characters enclosed in alphabetical characters. Ungreedy.
+		 *  5. Remove orphaned apostrophe, plus, dash, period, and comma characters.
+		 *  6. Remove orphaned quote characters.
+		 *  7. Replace the assorted single quotation marks with the ASCII standard single quotation.
+		 *  8. Remove multiple space characters and replaces with a single space.
+		 */
+		$input = JString::strtolower($input);
+		$input = preg_replace('#[^\pL\pM\pN\p{Pi}\p{Pf}\'+-.,]+#mui', ' ', $input);
+		$input = preg_replace('#(^|\s)[+-.,]+([\pL\pM]+)#mui', ' $1', $input);
+		$input = preg_replace('#([\pL\pM\pN]+)[+-.,]+(\s|$)#mui', '$1 ', $input);
+		$input = preg_replace('#([\pL\pM]+)[+.,]+([\pL\pM]+)#muiU', '$1 $2', $input);
+		$input = preg_replace('#(^|\s)[\'+-.,]+(\s|$)#mui', ' ', $input);
+		$input = preg_replace('#(^|\s)[\p{Pi}\p{Pf}]+(\s|$)#mui', ' ', $input);
+		$input = preg_replace('#[' . $quotes . ']+#mui', '\'', $input);
+		$input = preg_replace('#\s+#mui', ' ', $input);
+		$input = JString::trim($input);
+
+		// Explode the normalized string to get the terms.
+		$terms = explode(' ', $input);
+
+		/*
+		 * If we have Unicode support and are dealing with Chinese text, Chinese
+		 * has to be handled specially because there are not necessarily any spaces
+		 * between the "words". So, we have to test if the words belong to the Chinese
+		 * character set and if so, explode them into single glyphs or "words".
+		 */
+		if ($lang === 'zh')
+		{
+			// Iterate through the terms and test if they contain Chinese.
+			for ($i = 0, $n = count($terms); $i < $n; $i++)
+			{
+				$charMatches = array();
+				$charCount = preg_match_all('#[\p{Han}]#mui', $terms[$i], $charMatches);
+
+				// Split apart any groups of Chinese characters.
+				for ($j = 0; $j < $charCount; $j++)
+				{
+					$tSplit = JString::str_ireplace($charMatches[0][$j], '', $terms[$i], false);
+					if (!empty($tSplit))
+					{
+						$terms[$i] = $tSplit;
+					}
+					else
+					{
+						unset($terms[$i]);
+					}
+
+					$terms[] = $charMatches[0][$j];
+				}
+			}
+
+			// Reset array keys.
+			$terms = array_values($terms);
+		}
+
+		/*
+		 * If we have to handle the input as a phrase, that means we don't
+		 * tokenize the individual terms and we do not create the two and three
+		 * term combinations. The phrase must contain more than one word!
+		 */
+		if ($phrase === true && count($terms) > 1)
+		{
+			// Create tokens from the phrase.
+			$tokens[] = new FinderIndexerToken($terms, $lang);
+		}
+		else
+		{
+			// Create tokens from the terms.
+			for ($i = 0, $n = count($terms); $i < $n; $i++)
+			{
+				$tokens[] = new FinderIndexerToken($terms[$i], $lang);
+			}
+
+			// Create two and three word phrase tokens from the individual words.
+			for ($i = 0, $n = count($tokens); $i < $n; $i++)
+			{
+				// Setup the phrase positions.
+				$i2 = $i + 1;
+				$i3 = $i + 2;
+
+				// Create the two word phrase.
+				if ($i2 < $n && isset($tokens[$i2]))
+				{
+					// Tokenize the two word phrase.
+					$token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term), $lang, $lang === 'zh' ? '' : ' ');
+					$token->derived = true;
+
+					// Add the token to the stack.
+					$tokens[] = $token;
+				}
+
+				// Create the three word phrase.
+				if ($i3 < $n && isset($tokens[$i3]))
+				{
+					// Tokenize the three word phrase.
+					$token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term, $tokens[$i3]->term), $lang, $lang === 'zh' ? '' : ' ');
+					$token->derived = true;
+
+					// Add the token to the stack.
+					$tokens[] = $token;
+				}
+			}
+		}
+
+		if ($store)
+		{
+			$cache[$store] = count($tokens) > 1 ? $tokens : array_shift($tokens);
+			return $cache[$store];
+		}
+		else
+		{
+			return count($tokens) > 1 ? $tokens : array_shift($tokens);
+		}
+	}
+
+	/**
+	 * Method to get the base word of a token. This method uses the public
+	 * {@link FinderIndexerHelper::$stemmer} object if it is set. If no stemmer is set,
+	 * the original token is returned.
+	 *
+	 * @param   string  $token  The token to stem.
+	 * @param   string  $lang   The language of the token.
+	 *
+	 * @return  string  The root token.
+	 *
+	 * @since   2.5
+	 */
+	public static function stem($token, $lang)
+	{
+		// Trim apostrophes at either end of the token.
+		$token = JString::trim($token, '\'');
+
+		// Trim everything after any apostrophe in the token.
+		if (($pos = JString::strpos($token, '\'')) !== false)
+		{
+			$token = JString::substr($token, 0, $pos);
+		}
+
+		// Stem the token if we have a valid stemmer to use.
+		if (self::$stemmer instanceof FinderIndexerStemmer)
+		{
+			return self::$stemmer->stem($token, $lang);
+		}
+		else
+		{
+			return $token;
+		}
+	}
+
+	/**
+	 * Method to add a content type to the database.
+	 *
+	 * @param   string  $title  The type of content. For example: PDF
+	 * @param   string  $mime   The mime type of the content. For example: PDF [optional]
+	 *
+	 * @return  integer  The id of the content type.
+	 *
+	 * @since   2.5
+	 * @throws  Exception on database error.
+	 */
+	public static function addContentType($title, $mime = null)
+	{
+		static $types;
+
+		$db = JFactory::getDbo();
+		$query = $db->getQuery(true);
+
+		// Check if the types are loaded.
+		if (empty($types))
+		{
+			// Build the query to get the types.
+			$query->select('*')
+				->from($db->quoteName('#__finder_types'));
+
+			// Get the types.
+			$db->setQuery($query);
+			$types = $db->loadObjectList('title');
+		}
+
+		// Check if the type already exists.
+		if (isset($types[$title]))
+		{
+			return (int) $types[$title]->id;
+		}
+
+		// Add the type.
+		$query->clear()
+			->insert($db->quoteName('#__finder_types'))
+			->columns(array($db->quoteName('title'), $db->quoteName('mime')))
+			->values($db->quote($title) . ', ' . $db->quote($mime));
+		$db->setQuery($query);
+		$db->execute();
+
+		// Return the new id.
+		return (int) $db->insertid();
+	}
+
+	/**
+	 * Method to check if a token is common in a language.
+	 *
+	 * @param   string  $token  The token to test.
+	 * @param   string  $lang   The language to reference.
+	 *
+	 * @return  boolean  True if common, false otherwise.
+	 *
+	 * @since   2.5
+	 */
+	public static function isCommon($token, $lang)
+	{
+		static $data;
+
+		// Load the common tokens for the language if necessary.
+		if (!isset($data[$lang]))
+		{
+			$data[$lang] = self::getCommonWords($lang);
+		}
+
+		// Check if the token is in the common array.
+		if (in_array($token, $data[$lang]))
+		{
+			return true;
+		}
+		else
+		{
+			return false;
+		}
+	}
+
+	/**
+	 * Method to get an array of common terms for a language.
+	 *
+	 * @param   string  $lang  The language to use.
+	 *
+	 * @return  array  Array of common terms.
+	 *
+	 * @since   2.5
+	 * @throws  Exception on database error.
+	 */
+	public static function getCommonWords($lang)
+	{
+		$db = JFactory::getDbo();
+
+		// Create the query to load all the common terms for the language.
+		$query = $db->getQuery(true)
+			->select($db->quoteName('term'))
+			->from($db->quoteName('#__finder_terms_common'))
+			->where($db->quoteName('language') . ' = ' . $db->quote($lang));
+
+		// Load all of the common terms for the language.
+		$db->setQuery($query);
+		$results = $db->loadColumn();
+
+		return $results;
+	}
+
+	/**
+	 * Method to get the default language for the site.
+	 *
+	 * @return  string  The default language string.
+	 *
+	 * @since   2.5
+	 */
+	public static function getDefaultLanguage()
+	{
+		static $lang;
+
+		// We need to go to com_languages to get the site default language, it's the best we can guess.
+		if (empty($lang))
+		{
+			$lang = JComponentHelper::getParams('com_languages')->get('site', 'en-GB');
+		}
+
+		return $lang;
+	}
+
+	/**
+	 * Method to parse a language/locale key and return a simple language string.
+	 *
+	 * @param   string  $lang  The language/locale key. For example: en-GB
+	 *
+	 * @return  string  The simple language string. For example: en
+	 *
+	 * @since   2.5
+	 */
+	public static function getPrimaryLanguage($lang)
+	{
+		static $data;
+
+		// Only parse the identifier if necessary.
+		if (!isset($data[$lang]))
+		{
+			if (is_callable(array('Locale', 'getPrimaryLanguage')))
+			{
+				// Get the language key using the Locale package.
+				$data[$lang] = Locale::getPrimaryLanguage($lang);
+			}
+			else
+			{
+				// Get the language key using string position.
+				$data[$lang] = JString::substr($lang, 0, JString::strpos($lang, '-'));
+			}
+		}
+
+		return $data[$lang];
+	}
+
+	/**
+	 * Method to get the path (SEF route) for a content item.
+	 *
+	 * @param   string  $url  The non-SEF route to the content item.
+	 *
+	 * @return  string  The path for the content item.
+	 *
+	 * @since   2.5
+	 */
+	public static function getContentPath($url)
+	{
+		static $router;
+
+		// Only get the router once.
+		if (!($router instanceof JRouter))
+		{
+			jimport('joomla.application.router');
+			include_once JPATH_SITE . '/includes/application.php';
+
+			// Get and configure the site router.
+			$config = JFactory::getConfig();
+			$router = JRouter::getInstance('site');
+			$router->setMode($config->get('sef', 1));
+		}
+
+		// Build the relative route.
+		$uri = $router->build($url);
+		$route = $uri->toString(array('path', 'query', 'fragment'));
+		$route = str_replace(JUri::base(true) . '/', '', $route);
+
+		return $route;
+	}
+
+	/**
+	 * Method to get extra data for a content before being indexed. This is how
+	 * we add Comments, Tags, Labels, etc. that should be available to Finder.
+	 *
+	 * @param   FinderIndexerResult  &$item  The item to index as an FinderIndexerResult object.
+	 *
+	 * @return  boolean  True on success, false on failure.
+	 *
+	 * @since   2.5
+	 * @throws  Exception on database error.
+	 */
+	public static function getContentExtras(FinderIndexerResult &$item)
+	{
+		// Get the event dispatcher.
+		$dispatcher = JEventDispatcher::getInstance();
+
+		// Load the finder plugin group.
+		JPluginHelper::importPlugin('finder');
+
+		try
+		{
+			// Trigger the event.
+			$results = $dispatcher->trigger('onPrepareFinderContent', array(&$item));
+
+			// Check the returned results. This is for plugins that don't throw
+			// exceptions when they encounter serious errors.
+			if (in_array(false, $results))
+			{
+				throw new Exception($dispatcher->getError(), 500);
+			}
+		}
+		catch (Exception $e)
+		{
+			// Handle a caught exception.
+			throw $e;
+		}
+
+		return true;
+	}
+
+	/**
+	 * Method to process content text using the onContentPrepare event trigger.
+	 *
+	 * @param   string     $text    The content to process.
+	 * @param   JRegistry  $params  The parameters object. [optional]
+	 *
+	 * @return  string  The processed content.
+	 *
+	 * @since   2.5
+	 */
+	public static function prepareContent($text, $params = null)
+	{
+		static $loaded;
+
+		// Get the dispatcher.
+		$dispatcher = JEventDispatcher::getInstance();
+
+		// Load the content plugins if necessary.
+		if (empty($loaded))
+		{
+			JPluginHelper::importPlugin('content');
+			$loaded = true;
+		}
+
+		// Instantiate the parameter object if necessary.
+		if (!($params instanceof JRegistry))
+		{
+			$registry = new JRegistry;
+			$registry->loadString($params);
+			$params = $registry;
+		}
+
+		// Create a mock content object.
+		$content = JTable::getInstance('Content');
+		$content->text = $text;
+
+		// Fire the onContentPrepare event.
+		$dispatcher->trigger('onContentPrepare', array('com_finder.indexer', &$content, &$params, 0));
+
+		return $content->text;
+	}
+}