PHP Classes

File: docs/files/UrlExtractor.php.txt

Recommend this page to a friend!
  Classes of João Ribeiro   PHP URL Extractor   docs/files/UrlExtractor.php.txt   Download  
File: docs/files/UrlExtractor.php.txt
Role: Documentation
Content type: text/plain
Description: Documentation
Class: PHP URL Extractor
Extract URLs of images and metadata from Web pages
Author: By João Ribeiro
Last change: Generated updated documentation
Updated class docs
Date: 8 years ago
Size: 12,032 bytes
 

Contents

Class file image Download
<?php
/**
 * Class UrlExtractor
 *
 * PHP version 5
 *
 * @category Utilities
 * @package  UrlExtractor
 * @author   Joao Ribeiro <joaopedrocr@gmail.com>
 * @license  http://www.gnu.org/copyleft/gpl.html GNU General Public License
 * @link     http://urlextractor.joaoperibeiro.com
 */

namespace rollbackpt\UrlExtractor;

/**
 * PHP Class to extract images and meta data information from URLs.
 *
 * Typical use: call extractAll() with a page URL; the page is downloaded and
 * its title, description, keywords and image URLs are collected into the
 * public properties and returned as an array or a JSON string.
 *
 * @category Utilities
 * @package  UrlExtractor
 * @author   Joao Ribeiro <joaopedrocr@gmail.com>
 * @license  http://www.gnu.org/copyleft/gpl.html GNU General Public License
 * @link     http://urlextractor.joaoperibeiro.com
 *
 * @TODO: Extract thumbnails from videos
 * @TODO: Split the code into smaller classes (One to handle meta tags
 * and other to handle images and thumbnails)
 * @TODO: Change get_meta_tags to Regex
 * @TODO: Add method to get existing extracted data
 */
class UrlExtractor
{
    /**
     * Use this const to define if you want to use curl (true) or
     * file_get_contents (false) to get the url contents
     */
    const CURL = true;

    /**
     * URL passed as a parameter to extractAll
     *
     * @var string $url
     */
    protected $url;

    /**
     * Host extracted from the URL, including the scheme when the URL has one
     * (Ex: http://localhost.com)
     *
     * @var string $host
     */
    protected $host;

    /**
     * Array to store all the images extracted from the URL
     *
     * @var array $images
     */
    public $images = array();

    /**
     * Title extracted from the URL
     *
     * @var string $title
     */
    public $title;

    /**
     * Description extracted from the URL
     *
     * @var string $description
     */
    public $description;

    /**
     * Array to store the keywords extracted from the URL
     *
     * @var array $keywords
     */
    public $keywords = array();

    /**
     * Array containing the name of the meta tags to be extracted.
     *
     * Each key is the name of the class property that receives the value;
     * each entry lists the meta tag names that feed that property. For
     * non-array properties a later tag in the list overwrites an earlier one.
     *
     * @var array $metaTagNames
     */
    protected $metaTagNames = array(
        'title' => array(
            'twitter:title',
            'og:title'
        ),
        'description' => array(
            'description',
            'twitter:description',
            'og:description'
        ),
        'keywords' => array(
            'keywords'
        ),
        'images' => array(
            'twitter:image',
            'twitter:image:src',
            'og:image'
        )
    );

    /**
     * Class constructor.
     *
     * @return void
     */
    public function __construct()
    {
        // Empty constructor for future implementations
    }

    /**
     * Function extractAll
     *
     * Extract all the elements from the URL
     * (title, description, keywords and images)
     *
     * @param string  $url  URL of the page to extract the information from
     * @param boolean $json Define if the result is returned in an array or a
     *                      json string.
     *
     * @return array|string Return an array or JSON string with the url info
     *                      (title, description, keywords and images) or an
     *                      error message when the page can't be downloaded
     *
     * @throws \InvalidArgumentException When the given URL is empty
     */
    public function extractAll($url, $json = true)
    {
        // Check the url parameter. The exception class is fully qualified:
        // an unqualified name would be resolved inside this namespace.
        if (empty($url)) {
            throw new \InvalidArgumentException("URL can't be empty!");
        }

        $this->url = $url;

        // Clean variables from old calls
        $this->host = "";
        $this->title = "";
        $this->description = "";
        $this->keywords = array();
        $this->images = array();

        // Get the url contents for extraction
        if (self::CURL) {
            $urlContent = $this->curlGetContents($this->url);
        } else {
            $urlContent = @file_get_contents($this->url);
        }

        // The failure check has to happen before any string manipulation,
        // otherwise false is silently converted into an empty string
        if (!is_string($urlContent)) {
            $error = array('error' => 'Invalid URL');

            return ($json) ? json_encode($error) : $error;
        }

        // Avoid errors in the Regex matcher because of glued metatags
        $urlContent = str_replace("<meta", "\n<meta", $urlContent);

        // The order matters: the page title overrides the title meta tags
        // and the regular meta tags override the property ones
        $this->getHost($this->url);
        $this->getMetaTagsByProperty($urlContent);
        $this->getPageTitle($urlContent);
        $this->getMetaTagsByName($urlContent);
        $this->getImages($urlContent);

        $urlInfo = array(
            'title' => $this->title,
            'description' => $this->description,
            // Before assign, remove duplicate keywords and reorder the array
            'keywords' => array_values(array_unique($this->keywords)),
            // Before assign, remove duplicate images and reorder the array
            'images' => array_values(array_unique($this->images))
        );

        return ($json) ? json_encode($urlInfo) : $urlInfo;
    }

    /**
     * Function getHost
     *
     * Get the host from the URL (Ex: http://localhost.com
     * is the host extracted from http://localhost.com/test/index.php)
     *
     * The result is stored in $this->host; it is an empty string when no
     * host can be found.
     *
     * @param string $url URL to get the host from
     *
     * @return void
     */
    protected function getHost($url)
    {
        $this->host = "";

        $parts = parse_url($url);

        if (is_array($parts) && !empty($parts['host'])) {
            $host = $parts['host'];

            if (isset($parts['port'])) {
                $host .= ':' . $parts['port'];
            }

            $this->host = isset($parts['scheme'])
                ? $parts['scheme'] . '://' . $host
                : $host;

            return;
        }

        // Fallback for URLs without a scheme (Ex: localhost.com/index.php),
        // where parse_url reports everything as a path
        $pattern = '/([^:]*:\/\/)?([^\/]*\.)*([^\/\.]+\.[^\/]+)/i';

        if (preg_match($pattern, $url, $results)) {
            $this->host = $results[0];
        }
    }

    /**
     * Function getPageTitle
     *
     * Get the text inside the title tag. A title that was already set is
     * kept when the page has no title tag (or an empty one).
     *
     * @param string $urlContent Page content to get the title from
     *
     * @return void
     */
    protected function getPageTitle($urlContent)
    {
        $title = $this->getText($urlContent, "<title>", "</title>");

        if ($title !== "") {
            $this->title = $title;
        }
    }

    /**
     * Function getMetaTagsByName
     *
     * Get the regular meta tags (description and keywords)
     *
     * @param string $urlContent Url content to get meta tags from
     *
     * @return void
     */
    protected function getMetaTagsByName($urlContent)
    {
        $pattern = $this->buildMetaTagPattern('name', 'description|keywords');

        preg_match_all($pattern, $urlContent, $results);

        $metaTags = $this->formatMetaTagsArray($results);

        if ($metaTags !== false) {
            $this->setUrlAtributes($metaTags);
        }
    }

    /**
     * Function getMetaTagsByProperty
     *
     * Get property meta tags like open graph for example
     * (Ex: <meta property="og:title" content="The Rock" />)
     *
     * @param string $urlContent Url content to get meta tags from
     *
     * @return void
     */
    protected function getMetaTagsByProperty($urlContent)
    {
        $pattern = $this->buildMetaTagPattern('property', '[^"\']+');

        preg_match_all($pattern, $urlContent, $results);

        $metaTags = $this->formatMetaTagsArray($results);

        if ($metaTags !== false) {
            $this->setUrlAtributes($metaTags);
        }
    }

    /**
     * Function buildMetaTagPattern
     *
     * Build the regex used to find meta tags by a given attribute.
     *
     * The capture groups are the ones formatMetaTagsArray expects:
     * 1 = name and 2 = content when the name attribute comes first,
     * 3 = content and 4 = name when the content attribute comes first.
     * Branch reset groups "(?|...)" keep that numbering while the value is
     * matched up to its own closing quote, so a content like "It's a | b"
     * is not cut at the apostrophe or at the pipe.
     *
     * @param string $attribute   Attribute holding the tag name
     *                            (name or property)
     * @param string $namePattern Regex fragment matching the accepted names
     *
     * @return string The complete regex pattern
     */
    private function buildMetaTagPattern($attribute, $namePattern)
    {
        $name = '\b' . $attribute . '\s*=\s*'
            . '(?|"(' . $namePattern . ')"|\'(' . $namePattern . ')\')';
        $content = '\bcontent\s*=\s*(?|"([^"]*)"|\'([^\']*)\')';

        return '/<meta\b[^>]*?' . $name . '[^>]*?' . $content . '[^>]*>'
            . '|<meta\b[^>]*?' . $content . '[^>]*?' . $name . '[^>]*>/i';
    }

    /**
     * Function getImages
     *
     * Get the images from the img tags of the page and add the valid ones
     * to $this->images
     *
     * @param string $urlContent Url content to get the images from
     *
     * @return void
     */
    protected function getImages($urlContent)
    {
        // The src value is matched up to its own closing quote and the tag
        // may span several lines
        $pattern = '/<img\b[^>]*?\bsrc\s*=\s*(?|"([^"]*)"|\'([^\']*)\')[^>]*>/i';

        if (!preg_match_all($pattern, $urlContent, $results)) {
            return;
        }

        foreach ($results[1] as $image) {
            $image = $this->checkImageUrl($image);

            if ($image !== null) {
                $this->images[] = $image;
            }
        }
    }

    /**
     * Function setUrlAtributes
     *
     * Set the class atributes and overwrite duplicates
     * (Ex: Description, Keywords)
     *
     * Tag names are compared case-insensitively. Empty values are ignored.
     * Values for array properties are appended; keywords are additionally
     * split by "," and trimmed.
     *
     * @param array $metaTags Meta tags found, as name => content
     *
     * @return void
     */
    protected function setUrlAtributes($metaTags)
    {
        if (!is_array($metaTags)) {
            return;
        }

        $metaTags = array_change_key_case($metaTags, CASE_LOWER);

        foreach ($this->metaTagNames as $key => $names) {
            foreach ($names as $tagName) {
                $tagName = strtolower($tagName);

                if (!array_key_exists($tagName, $metaTags)
                    || empty($metaTags[$tagName])
                ) {
                    continue;
                }

                $value = $metaTags[$tagName];

                if (!is_array($this->{$key})) {
                    $this->{$key} = $value;
                    continue;
                }

                // Hard coded rule to split keywords by ","
                if ($key === 'keywords') {
                    foreach (explode(",", $value) as $keyword) {
                        $this->{$key}[] = trim($keyword);
                    }
                } else {
                    $this->{$key}[] = $value;
                }
            }
        }
    }

    /**
     * Function formatMetaTagsArray
     *
     * Utility function used by getMetaTagsByProperty and getMetaTagsByName
     * to properly format the meta tag array expected by setUrlAtributes
     *
     * Every match is handled on its own, so a page mixing
     * name="..." content="..." and content="..." name="..." tags keeps
     * all of them. Names are lower-cased; when a name is repeated the last
     * occurrence wins.
     *
     * @param array $array Result of preg_match_all (pattern order) where
     *                     groups 1/2 are name/content and groups 3/4 are
     *                     content/name
     *
     * @return array|boolean Returns the array with the meta tags found
     *                       (name => content) or false when none of the
     *                       names listed in $metaTagNames was found
     */
    protected function formatMetaTagsArray($array)
    {
        if (!is_array($array) || empty($array[0])) {
            return false;
        }

        $knownTag = '/^(?:' . $this->getPropertyRuleString() . ')$/i';
        $metaTags = array();
        $foundKnownTag = false;

        foreach (array_keys($array[0]) as $i) {
            if (isset($array[1][$i]) && $array[1][$i] !== '') {
                // Name attribute before the content attribute
                $name = $array[1][$i];
                $content = isset($array[2][$i]) ? $array[2][$i] : '';
            } elseif (isset($array[4][$i]) && $array[4][$i] !== '') {
                // Content attribute before the name attribute
                $name = $array[4][$i];
                $content = isset($array[3][$i]) ? $array[3][$i] : '';
            } else {
                continue;
            }

            $name = strtolower(trim($name));
            $metaTags[$name] = $content;

            if (preg_match($knownTag, $name)) {
                $foundKnownTag = true;
            }
        }

        return $foundKnownTag ? $metaTags : false;
    }

    /**
     * Function checkImageUrl
     *
     * Utility function used by getImages to check image URL
     * and complete relative URLs
     *
     * Only URLs containing a .jpg, .jpeg, .gif, .png or .bmp extension are
     * accepted. Absolute URLs are returned untouched, protocol-relative
     * URLs (//cdn.example.com/a.png) receive the scheme of the page and
     * every other URL is prefixed with the host.
     *
     * NOTE: relative paths are resolved against the root of the host and
     * not against the directory of the page; "../" segments are removed.
     *
     * @param string $url Url of the image to be checked
     *
     * @return string|null Image url or null when it isn't a supported image
     */
    protected function checkImageUrl($url)
    {
        $url = trim((string) $url);

        // The extension must not be followed by another letter or digit,
        // but a query string or fragment after it is fine
        $extension = '/\.(?:jpe?g|gif|png|bmp)(?![a-z0-9])/i';

        if ($url === '' || !preg_match($extension, $url)) {
            return null;
        }

        // Absolute URL (scheme://...)
        if (preg_match('/^[a-z][a-z0-9+.\-]*:\/\//i', $url)) {
            return $url;
        }

        // Protocol-relative URL: it already carries its own host
        if (strpos($url, '//') === 0) {
            $scheme = parse_url((string) $this->host, PHP_URL_SCHEME);

            return ($scheme ? $scheme : 'http') . ':' . $url;
        }

        $url = str_replace('../', '', $url);

        return ($url[0] === '/')
            ? $this->host . $url
            : $this->host . '/' . $url;
    }

    /**
     * Function getText
     *
     * Utility function that extract text between start and end points
     *
     * @param string $text  Text to search in
     * @param string $start Marker placed right before the wanted text
     * @param string $end   Marker placed right after the wanted text
     *
     * @return string The text extracted, an empty string when the start
     *                marker isn't found and everything after the start
     *                marker when the end marker isn't found
     */
    protected function getText($text, $start, $end)
    {
        $text = (string) $text;
        $start = (string) $start;
        $end = (string) $end;

        if ($start === '') {
            return "";
        }

        $from = strpos($text, $start);

        if ($from === false) {
            return "";
        }

        $from += strlen($start);
        $to = ($end === '') ? false : strpos($text, $end, $from);

        if ($to === false) {
            // Older PHP versions return false for an empty remainder
            return (string) substr($text, $from);
        }

        return (string) substr($text, $from, $to - $from);
    }

    /**
     * Function getPropertyRuleString
     *
     * Goes through all the property names and generates a regex rule to
     * match at least one of them
     *
     * @return string Property names, escaped to be used between "/"
     *                delimiters and concatenated with | to make a regex rule
     */
    protected function getPropertyRuleString()
    {
        $names = array();

        foreach ($this->metaTagNames as $type) {
            foreach ($type as $tag) {
                $names[] = preg_quote($tag, '/');
            }
        }

        return implode("|", $names);
    }

    /**
     * Function curlGetContents
     *
     * Same as file_get_contents but using curl to avoid getting error 403
     * forbidden because the request doesn't have a valid user agent
     *
     * @param string $url Url to get the contents from using Curl
     *
     * @return string|boolean $output Contents obtained from the url or
     *                                false when the request fails
     */
    protected function curlGetContents($url)
    {
        // create curl resource
        $ch = curl_init();

        if ($ch === false) {
            return false;
        }

        // set url
        curl_setopt($ch, CURLOPT_URL, $url);

        // return the transfer as a string
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13');

        // $output contains the output string
        $output = curl_exec($ch);

        // close curl resource to free up system resources
        curl_close($ch);

        return $output;
    }
}