Your IP : 10.10.0.253


Current Path : /var/www/libraries/foundry/libraries/
Upload File :
Current File : /var/www/libraries/foundry/libraries/scraper.php

<?php
/**
* @package		Foundry
* @copyright	Copyright (C) Stack Ideas Sdn Bhd. All rights reserved.
* @license		GNU/GPL, see LICENSE.php
* Foundry is free software. This version may have been modified pursuant
* to the GNU General Public License, and as distributed it includes or
* is derivative of works licensed under the GNU General Public License or
* other free or open source software licenses.
* See COPYRIGHT.php for copyright notices and details.
*/
namespace Foundry\Libraries;

defined('_JEXEC') or die('Unauthorized Access');

use Foundry\Libraries\SimpleHtml;
use Foundry\Helpers\StringHelper;

require_once(__DIR__ . '/scraper/oembed/adapter.php');
require_once(__DIR__ . '/scraper/plugins/plugin.php');

class Scraper
{
	private $hooks = [];
	private $oembeds = [];

	private $contents = null;
	private $url = '';

	private $plugins = [
		'title',
		'description',
		'keywords',
		'opengraph',
		'images'
	];

	public function __construct($url)
	{
		$this->url = $this->normalizeUrl($url);
		$this->oembeds = $this->getOembedAdapters();
	}

	/**
	 * Extract the content type from the header
	 *
	 * @since	1.0.0
	 * @access	public
	 */
	private function extractContentType($headers)
	{
		preg_match_all("/Content-Type: (\w+\/\w+)/i", $headers, $matches);

		if (isset($matches[1][0])) {
			return $matches[1][0];
		}

		return false;
	}

	/**
	 * Determines the type of the response from the header
	 *
	 * @since	1.0.0
	 * @access	public
	 */
	public function getContentType($header)
	{
		$mime = $this->extractContentType($header);

		if (in_array($mime, ['image/jpeg', 'image/png', 'image/gif'])) {
			return 'image';
		}

		// Treat any text based as html
		if ($mime === 'text/html' || $mime === 'application/json') {
			return 'html';
		}

		return false;
	}

	/**
	 * Creates a new plugin instance
	 *
	 * @since	1.0.0
	 * @access	public
	 */
	public function getPlugin($plugin, $parser, $contents)
	{
		require_once(__DIR__ . '/scraper/plugins/' . $plugin . '.php');

		$pluginClass = 'ScraperPlugin' . ucfirst($plugin);
		$plugin = new $pluginClass($parser, $contents, $this->url);

		return $plugin;
	}

	/**
	 * Create an instance of the scraper adapter 
	 *
	 * @since	1.0.0
	 * @access	public
	 */
	public function getOembedAdapter($type)
	{
		$file = __DIR__ . '/scraper/oembed/' . $type . '.php';
		$exists = file_exists($file);

		if (!$exists) {
			return false;
		}

		require_once($file);

		$adapterClassName = 'OembedAdapter' . ucfirst($type);
		$adapter = new $adapterClassName($this->url);

		return $adapter;
	}

	/**
	 * Create an instance of the scraper adapter 
	 *
	 * @since	1.0.0
	 * @access	public
	 */
	public function getOembedAdapterForUrl($url)
	{
		foreach ($this->oembeds as $adapter) {
			$handler = $adapter->isHandler($url);

			if ($handler !== false) {
				return $adapter;
			}
		}

		return false;
	}

	/**
	 * Initialize adapters available for the scraper
	 *
	 * @since	1.0.0
	 * @access	public
	 */
	public function getOembedAdapters()
	{
		$files = \JFolder::files(__DIR__ . '/scraper/oembed', '.php', false, false, array('.svn', 'CVS', '.DS_Store', '__MACOSX', 'adapter.php'));

		if (!$files) {
			return false;
		}

		$adapters = [];

		foreach ($files as $file) {
			$adapters[] = $this->getOembedAdapter(str_ireplace('.php', '', $file));
		}

		return $adapters;
	}

	/**
	 * Determines if the response header determines if this is a html document
	 *
	 * @since	1.0.0
	 * @access	public
	 */
	public function isImage($headers)
	{
		$type = $this->getContentType($headers);

		return $type === 'image';
	}

	/**
	 * Determines if the response header determines if this is a html document
	 *
	 * @since	1.0.0
	 * @access	public
	 */
	public function isHtmlDocument($headers)
	{
		$type = $this->getContentType($headers);

		return $type === 'html';
	}

	/**
	 * Normalizes the output of a url
	 *
	 * @since	1.0.0
	 * @access	public
	 */
	public function normalizeContent($url, $content)
	{
		$info = parse_url($url);

		// This will ensure that urls that doesn't contain a scheme, will be prefixed with the correct scheme. E.g: //some/image.png
		$content = str_ireplace('src="//', 'src="' . $info['scheme'] . '://' , $content);

		return $content;
	}

	/**
	 * Normalizes the url and ensure that it is a valid url
	 *
	 * @since	1.0.0
	 * @access	public
	 */
	public function normalizeUrl($url)
	{
		$url = trim($url);

		if (stristr($url, 'http://') === false && stristr($url, 'https://') === false) {
			$url = 'http://' . $url;
		}

		return $url;
	}

	/**
	 * Normalizes the result object to ensure that all our data is standardized
	 *
	 * @since	1.0.0
	 * @access	public
	 */
	public function normalizeResult(&$result)
	{
		if (!isset($result->images)) {
			$result->images = array();
		}

		// If there is an oembed title, we should use it instead
		if (isset($result->oembed->title)) {
			$result->title = $result->oembed->title;
		}

		// We should rely on the opengraph title if there is
		if (isset($result->opengraph->title)) {
			$result->title = $result->opengraph->title;
		}

		if (isset($result->opengraph->desc)) {
			$result->description = $result->opengraph->desc;
		}

		if (isset($result->oembed->description)) {
			$result->description = $result->oembed->description;
		}

		// Normalize the properties
		$result->title = isset($result->title) ? $result->title : $result->url;

		// If the oembed has a thumbnail, we should always use it as the first image
		if (isset($result->oembed->thumbnail)) {
			array_unshift($result->images, $result->oembed->thumbnail);
		}

		// If the page has opengraph data
		if (isset($result->opengraph->image)) {
			array_unshift($result->images, $result->opengraph->image);
		}

		// If opengraph has video
		if (isset($result->opengraph->video)) {
			$result->video = $result->opengraph->video;
		}

		if (!isset($result->video)) {
			$result->video = false;
		}
	}

	/**
	 * Rescrapes a url
	 *
	 * @since	1.0.0
	 * @access	public
	 */
	public function rescrape($url)
	{
		$scraper = new Scraper($url);
		return $scraper->scrape();
	}

	/**
	 * Scrapes url and retrieves the content from the particular page
	 *
	 * @since	1.0.0
	 * @access	public
	 */
	public function scrape()
	{
		// Get the appropriate oembed adapter for this url
		$oembed = $this->getOembedAdapterForUrl($this->url);

		if (!$oembed) {
			$oembed = new \OembedAdapter($this->url);
		}

		$result = $oembed->scrape();

		// Oembed adapter implements its own scrape method, so we do not need to proceed
		if ($result !== true) {
			return $result;
		}

		$connector = \FH::connector($this->url);
		$contents = $connector->setUserAgent('Facebot')->execute()->getResult();

		// Get the response headers so that we can determine the response type
		$headers = $connector->getResponseHeaders();

		// For non html result, we should not process anything
		if (!$this->isHtmlDocument($headers)) {
			$result = (object) [
				'url' => $this->url,
				'description' => $this->url,
				'images' => $this->isImage($headers) ? $this->url : []
			];

			// Format the result
			$this->normalizeResult($result);

			return $result;
		}

		// Normalize the contents
		$contents = $this->normalizeContent($this->url, $contents);

		// Make sure the content is utf-8 as SocialSimpleHTML can only support UTF-8
		// $contents = mb_convert_encoding($contents, "UTF-8");
		if (!mb_detect_encoding($contents, 'UTF-8', true)) {
			$charset = 'utf-8';

			preg_match_all("/charset=([^()<>@,;:\"\/[\]?.=\s]*)/i", $header, $matches);

			if ($matches && isset($matches[1]) && $matches[1]) {
				$charset = $matches[1][0];
			}

			if ($charset === 'windows-1251') {
				$contents = mb_convert_encoding($contents, "utf-8", "windows-1251");
			}

			if ($charset !== 'windows-1251') {
				$contents = StringHelper::forceUTF8($contents);
			}
		}

		// Get the parser
		$this->parser = SimpleHtml::str_get_html($contents);

		// If we cannot parse the html, we shouldn't try to do anything
		if (!$this->parser) {
			return false;
		}

		$oembed->setParser($this->parser);
		$oembed->setContents($contents);

		// When scraping an amp page, we need to find the correct url
		$amp = $this->parser->find('html[amp]');

		if ($amp && isset($amp[0])) {
			$canonical = $this->parser->find('link[rel=canonical]');

			if ($canonical && isset($canonical[0])) {
				$url = $canonical[0]->href;

				return $this->rescrape($url);
			}
		}

		// When there are redirections, we also need to handle the correct url
		$httpEquiv = $this->parser->find('meta[http-equiv=refresh]');

		if ($httpEquiv && isset($httpEquiv[0])) {
			$httpEquiv = $httpEquiv[0]->attr['content'];

			// Check if this refresh value has url in it.
			$pattern = '/url=["\'](.*)["\']/i';
			preg_match($pattern, $httpEquiv, $matches);

			if (!empty($matches)) {
				return $this->rescrape($matches[1]);
			}
		}

		$result = (object) [
			'oembed' => $oembed->getOembedData($this->parser->find('link[type=application/json+oembed]'))
		];

		foreach ($this->plugins as $plugin) {
			$plugin = $this->getPlugin($plugin, $this->parser, $contents);

			$plugin->process($result);
		}

		if (method_exists($oembed, 'process')) {
			$oembed->process($result);
		}

		// Ensure that we have a standardized result
		$this->normalizeResult($result);

		return $result;
	}
}