File: /home/imensosw/www/mpl.imenso.co/app/WebsiteParser.php
<?php
/**
* A Website parser class.
*
* Grabs a website's contents and extracts all hyperlinks and image sources.
*
* @author Morshed Alam <morshed201@gmail.com>
* @link http://www.scripts.morshed-alam.com/scrapping/
* @website http://morshed-alam.com
*/
namespace App;
class WebsiteParser
{
    /**
     * Link-type filters understood by setLinksType() / getHrefLinks().
     * String flags are used because they are easier to read than integers.
     */
    const LINK_TYPE_ALL = 'all';
    const LINK_TYPE_INTERNAL = 'internal';
    const LINK_TYPE_EXTERNAL = 'external';
    const LINK_TYPE_UNKNOWN = 'unknown';

    /**
     * Active link-type filter (one of the LINK_TYPE_* constants).
     * @var string
     */
    protected $link_type = self::LINK_TYPE_ALL;

    /**
     * The target website url to parse, exactly as given to the constructor.
     * @var string
     */
    public $target_url = '';

    /**
     * Site root derived from the target url: "scheme://host[:port]/".
     * @var string
     */
    public $base_url = '';

    /**
     * Directory of the target document ("scheme://host[:port]/dir/"),
     * used to resolve document-relative links.
     * @var string
     */
    public $absolute_url = '';

    /**
     * Host name of the target url.
     * @var string
     */
    protected $domain = '';

    /**
     * Grabbed html content from the target website, or null when nothing
     * has been grabbed (or the last grab failed).
     * @var string|null
     */
    public $content = null;

    /**
     * Hyper links found by the last successful getHrefLinks() run,
     * as a list of [url, title] pairs.
     * @var array
     */
    public $href_links = [];

    /**
     * Image sources collected by getImageSources().
     * @var array
     */
    public $image_sources = [];

    /**
     * Regular expressions.
     *
     * $full_link_pattern      matches urls that already carry a protocol / host
     * $href_filter_pattern    filters out invalid hyper links
     * $href_expression        extracts hyper links
     * $img_expression         extracts image sources
     * $external_link_pattern  recognises any absolute http(s) / protocol-relative url
     * $internal_link_template template for $internal_link_pattern; "#domain#" is
     *                         replaced with the quoted target host in setUrls()
     * $title_expression       extracts the document title
     */
    public $full_link_pattern = '/\/\/|www\.|mailto:/';
    private $href_filter_pattern = '/\<|#|javascript:void/';
    private $href_expression = '/\<a\s[^>]*href\s*=\s*\"([^\"]*)\"[^>]*>(.*?)<\/a>/';
    private $img_expression = '/<img[^>]+src=([\'"])?((?(1).+?|[^\s>]+))(?(1)\1)/';
    private $external_link_pattern = '/^(https?:){0,1}\/\/(www\.){0,1}(.*)/i';
    // The trailing look-ahead stops "example.com.evil.org" from matching "example.com".
    private $internal_link_template = '/^(https?:){0,1}\/\/(www\.){0,1}#domain#(?=[\/:?#]|$)/i';
    private $internal_link_pattern = '';
    // Case-insensitive, non-greedy and dot-matches-newline so that "<TITLE>",
    // multi-line titles and documents with several </title> tags are handled.
    private $title_expression = '/<title(?:\s[^>]*)?>(.*?)<\/title>/is';
    // Meta tags normally have this form: <meta name="NAME" content="CONTENT" />
    // Facebook uses "property" instead of "name", see http://ogp.me/
    private $metatags_expression = '/<meta[^>]+(?:name|property)="([^"]*)"[^>]+content="([^"]*)"[^>]*>/';

    /**
     * Message of WebsiteParser (set when grabbing content fails).
     * @var string
     */
    public $message = '';

    /**
     * Class constructor.
     *
     * @param string $url       Target url to parse
     * @param string $link_type Link type to grab (one of the LINK_TYPE_* constants)
     * @throws \RuntimeException when the cUrl extension is not available
     */
    public function __construct($url, $link_type = self::LINK_TYPE_ALL)
    {
        $this->_isCurl();
        $this->target_url = $url;
        $this->setUrls();
        $this->setLinksType($link_type);
    }

    /**
     * The class destructor.
     *
     * Intentionally a no-op; kept so subclasses calling parent::__destruct()
     * keep working.
     */
    public function __destruct()
    {
    }

    /**
     * Return the grabbed content.
     *
     * @param bool $grab When true the target url is fetched first
     * @return string|null The html content, or null when none is available
     */
    public function getContent($grab = false)
    {
        if ($grab) {
            $this->grabContent();
        }

        return $this->content;
    }

    /**
     * Extract all href links from the grabbed content.
     *
     * Relative urls are made absolute and the configured link-type filter is
     * applied. The result replaces the previous list, so calling this method
     * repeatedly does not accumulate duplicates.
     *
     * @param bool $grab When true the target url is fetched first
     * @return array List of [url, title] pairs
     */
    public function getHrefLinks($grab = true)
    {
        if ($grab) {
            $this->grabContent();
        }

        if (! $this->hasContent()) {
            return $this->href_links;
        }

        preg_match_all($this->href_expression, $this->content, $match_links);
        $urls = isset($match_links[1]) ? $match_links[1] : [];

        $links = [];
        // array_unique() preserves keys, so $index still points at the matching link body.
        foreach (array_unique($urls) as $index => $url) {
            $title = $this->findLinkTitle($url, $match_links[2][$index]);

            if (preg_match($this->href_filter_pattern, $url)
                || preg_match($this->href_filter_pattern, $title)
            ) {
                continue;
            }

            if (! preg_match($this->full_link_pattern, $url)) {
                $url = $this->sanitizeUrl($url);
            }

            if ($this->link_type !== self::LINK_TYPE_ALL
                && $this->getLinkType($url) !== $this->link_type
            ) {
                continue;
            }

            $links[] = [$url, $title];
        }

        $this->href_links = $links;

        return $this->href_links;
    }

    /**
     * Extract all image sources from the grabbed content.
     *
     * Newly found sources are merged into $image_sources; the list is kept
     * free of duplicates and empty entries.
     *
     * @param bool $grab When true the target url is fetched first
     * @return array List of image urls
     */
    public function getImageSources($grab = false)
    {
        if ($grab) {
            $this->grabContent();
        }

        if ($this->hasContent()) {
            preg_match_all($this->img_expression, $this->content, $match_images);
            $sources = isset($match_images[2]) ? $match_images[2] : [];

            foreach ($sources as $source) {
                $source = trim($source);
                if ($source === '') {
                    continue;
                }
                if (! preg_match($this->full_link_pattern, $source)) {
                    $source = $this->sanitizeUrl($source);
                }
                $this->image_sources[] = $source;
            }
        }

        $this->image_sources = array_values(array_unique(array_filter($this->image_sources)));

        return $this->image_sources;
    }

    /**
     * Extract the document title from the grabbed content.
     *
     * @param bool $grab When true the target url is fetched first
     * @return string The trimmed title, or an empty string when none is found
     */
    public function getTitle($grab = false)
    {
        if ($grab) {
            $this->grabContent();
        }

        if (! $this->hasContent()) {
            return '';
        }

        if (preg_match($this->title_expression, $this->content, $match_title) && isset($match_title[1])) {
            return trim($match_title[1]);
        }

        return '';
    }

    /**
     * Extract all meta tags from the grabbed content.
     *
     * @param bool $grab When true the target url is fetched first
     * @return array List of [name, content] pairs; tags with empty content are skipped
     */
    public function getMetaTags($grab = false)
    {
        $metatags = [];

        if ($grab) {
            $this->grabContent();
        }

        if (! $this->hasContent()) {
            return $metatags;
        }

        preg_match_all($this->metatags_expression, $this->content, $match_tags);
        $names = isset($match_tags[1]) ? $match_tags[1] : [];

        foreach ($names as $index => $name) {
            $value = isset($match_tags[2][$index]) ? trim($match_tags[2][$index]) : '';
            if ($value !== '') {
                $metatags[] = [trim($name), $value];
            }
        }

        return $metatags;
    }

    /**
     * Truncate text to a preferred length without cutting words.
     *
     * @param string $text       Input text to truncate
     * @param int    $length     How many characters to keep
     * @param string $replace_by Text appended to signal continuation
     * @return string The truncated text, or the original text when it already fits
     */
    public function truncateText($text, $length = 50, $replace_by = '...')
    {
        // wordwrap() breaks on word boundaries; the first chunk is the part to keep.
        $chunks = explode('_____', wordwrap($text, $length, '_____', false));
        $head = array_shift($chunks);

        if (strlen($text) <= strlen($head)) {
            return $text;
        }

        return $head.$replace_by;
    }

    /**
     * Set the link type to extract from grabbed contents.
     *
     * @param string $link_type One of the LINK_TYPE_* constants
     */
    public function setLinksType($link_type = self::LINK_TYPE_ALL)
    {
        $this->link_type = $link_type;
    }

    /**
     * Derive $base_url, $absolute_url, $domain and the internal-link pattern
     * from $target_url.
     *
     * The scheme (and port, if any) of the target is preserved; targets given
     * without a scheme are treated as http.
     */
    private function setUrls()
    {
        $target = trim((string) $this->target_url);

        // parse_url() only recognises the host when a scheme is present, so
        // scheme-less input such as "example.com/dir/page.html" is prefixed first.
        if (preg_match('/^[a-z][a-z0-9+.\-]*:\/\//i', $target)) {
            $parseable = $target;
        } elseif (strncmp($target, '//', 2) === 0) {
            $parseable = 'http:'.$target;
        } else {
            $parseable = 'http://'.$target;
        }

        $parts = parse_url($parseable);
        if (! is_array($parts)) {
            $parts = [];
        }

        $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : 'http';
        $host = isset($parts['host']) ? $parts['host'] : '';
        $port = isset($parts['port']) ? ':'.$parts['port'] : '';
        $path = isset($parts['path']) ? $parts['path'] : '/';

        $this->domain = $host;
        $this->base_url = $scheme.'://'.$host.$port.'/';

        // The pattern already treats a leading "www." as optional, so strip it
        // from the host; preg_quote() keeps dots (and "/") from acting as regex syntax.
        $bare_host = preg_replace('/^www\./i', '', $host);
        $this->internal_link_pattern = str_replace(
            '#domain#',
            preg_quote($bare_host, '/'),
            $this->internal_link_template
        );

        // Directory part of the path only: query strings and the file name are dropped.
        $last_slash = strrpos($path, '/');
        $directory = $last_slash === false ? '/' : substr($path, 0, $last_slash + 1);
        $this->absolute_url = rtrim($this->base_url, '/').'/'.ltrim($directory, '/');
    }

    /**
     * Fetch the target url with cUrl and store the body in $content.
     *
     * On failure $content is reset to null and $message describes the problem;
     * on success $message is cleared.
     */
    public function grabContent()
    {
        $this->message = '';

        $ch = curl_init($this->target_url);
        if ($ch === false) {
            $this->content = null;
            $this->message = 'Unable to grab site contents';

            return;
        }

        curl_setopt_array($ch, $this->defaultCurlOptions());
        $response = curl_exec($ch);
        curl_close($ch);

        if ($response === false) {
            $this->content = null;
            $this->message = 'Unable to grab site contents';

            return;
        }

        $this->content = $response;
    }

    /**
     * Turn a relative url into an absolute one.
     *
     * - urls that already carry a scheme ("http:", "tel:", "data:", ...) or are
     *   protocol-relative ("//host/...") are returned unchanged
     * - "/path" is resolved against the site root ($base_url)
     * - "./path" and "path" are resolved against the document directory ($absolute_url)
     *
     * @param string $url Url as found in the document
     * @return string Absolute url
     */
    public function sanitizeUrl($url)
    {
        $url = trim((string) $url);

        if (preg_match('/^(?:[a-z][a-z0-9+.\-]*:|\/\/)/i', $url)) {
            return $url;
        }

        // strncmp() instead of "strpos() == 0": the latter is also true when
        // the needle is missing (false == 0).
        if (strncmp($url, '/', 1) === 0) {
            return rtrim($this->base_url, '/').$url;
        }

        if (strncmp($url, './', 2) === 0) {
            return $this->absolute_url.substr($url, 2);
        }

        return $this->absolute_url.$url;
    }

    /**
     * Classify an absolute url relative to the target host.
     *
     * @param string $url Absolute url
     * @return string One of LINK_TYPE_INTERNAL, LINK_TYPE_EXTERNAL, LINK_TYPE_UNKNOWN
     */
    private function getLinkType($url)
    {
        if (preg_match($this->internal_link_pattern, $url)) {
            return self::LINK_TYPE_INTERNAL;
        }

        if (preg_match($this->external_link_pattern, $url)) {
            return self::LINK_TYPE_EXTERNAL;
        }

        return self::LINK_TYPE_UNKNOWN;
    }

    /**
     * Work out a display title for a link.
     *
     * Plain-text link bodies are returned as they are. Bodies containing
     * markup yield "Image:<file name>" when they wrap an image, otherwise the
     * link url itself.
     *
     * @param string $url          The link url
     * @param string $link_content Inner html of the anchor
     * @return string
     */
    private function findLinkTitle($url, $link_content = '')
    {
        if (! preg_match($this->href_filter_pattern, $link_content)) {
            return $link_content;
        }

        if (! preg_match_all($this->img_expression, $link_content, $match_images)) {
            return $url;
        }

        if (! isset($match_images[2][0])) {
            return $link_content;
        }

        $source = $match_images[2][0];
        // A source without any "/" is already a bare file name; strrpos()
        // returning false must not be used as an offset.
        $last_slash = strrpos($source, '/');
        $image_name = $last_slash === false ? $source : substr($source, $last_slash + 1);

        return 'Image:'.$image_name;
    }

    /**
     * Whether usable html content is currently loaded.
     *
     * @return bool
     */
    private function hasContent()
    {
        return is_string($this->content) && $this->content !== '';
    }

    /**
     * cUrl options used by grabContent().
     *
     * Built lazily (not as a property default) so the CURLOPT_* constants
     * are only touched once the extension is known to be loaded.
     *
     * @return array
     */
    private function defaultCurlOptions()
    {
        return [
            CURLOPT_RETURNTRANSFER => true,  // return web page
            CURLOPT_HEADER => false,         // don't return headers
            CURLOPT_FOLLOWLOCATION => true,  // follow redirects
            CURLOPT_ENCODING => '',          // handle all encodings
            CURLOPT_USERAGENT => 'spider',   // who am i
            CURLOPT_AUTOREFERER => true,     // set referrer on redirect
            CURLOPT_CONNECTTIMEOUT => 60,    // timeout on connect
            CURLOPT_TIMEOUT => 120,          // timeout on response
            CURLOPT_MAXREDIRS => 5,          // stop after 5 redirects
            // NOTE(review): TLS certificate and host verification are disabled,
            // which allows man-in-the-middle tampering of grabbed content.
            // Kept for backward compatibility; enable both if targets have valid certificates.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
        ];
    }

    /**
     * Ensure the cUrl extension is available.
     *
     * @throws \RuntimeException when cUrl is not enabled
     */
    private function _isCurl()
    {
        if (! function_exists('curl_version')) {
            throw new \RuntimeException('cUrl library is not enabled on this server.');
        }
    }
}