php - Extracting Information from website IMDB always generates an error -
i'm trying use class grab imdb url , class intending grab imdb info url ex : http://www.imdb.com/title/tt0371746/
instead returns error : { $param['error'] = "no title found in search results!"; return $param; }
i generate error when there no movie @ url, why returning error every time? when there movie @ every url add ?!
this class :
<?php class imdbgrabber { function __construct() { $this->time = "now()"; } function getmovieinfo($input) { $param = array(); $imdburl = $this->scruburl($input); if ($imdburl === null) { $param['error'] = "no title found in search results!"; return $param; } $content = $this->geturl($imdburl); if (stripos($content, "<meta name=\"application-name\" content=\"imdb\" />") !== false) { $param = $this->grabinfo($content); $param['imdb_url'] = $imdburl; } else { $param['error'] = "no media found on imdb!"; } return $param; } function scruburl($input) { $url = "http://www.google.com/search?q=imdb+" . stripslashes(rawurlencode($input)); $content = $this->geturl($url); $urls = $this->match_all('/<a href="(http:\/\/www.imdb.com\/title\/tt.*?)".*?>.*?<\/a>/ms', $content, 1); if (!isset($urls[0])) return null; else return $urls[0]; } function geturl($url) { $ch = curl_init(); curl_setopt($ch, curlopt_url, $url); curl_setopt($ch, curlopt_returntransfer, 1); curl_setopt($ch, curlopt_connecttimeout, 5); curl_setopt($ch, curlopt_useragent, "mozilla/5.0 (windows nt 5.1; rv:2.0.1) gecko/20100101 firefox/4.0.1"); $content = curl_exec($ch); curl_close($ch); return $content; } function getimage($image) { header("content-type: image/jpeg"); $imdb_poster = rawurldecode($image); $image = curl_init(); curl_setopt($image, curlopt_url, $imdb_poster); curl_setopt($image, curlopt_returntransfer, 1); curl_setopt($image, curlopt_connecttimeout, 5); $data = curl_exec($image); curl_close($image); return $data; } function match_all($regex, $str, $i = 0) { if (preg_match_all($regex, $str, $matches) === false) return false; else return $matches[$i]; } function match($regex, $str, $i = 0) { if (preg_match($regex, $str, $match) == 1) return $match[$i]; else return false; } function grabinfo($content) { $param = array(); $param['title_id'] = $this->match('/<link rel="canonical" href="http:\/\/www.imdb.com\/title\/(tt[0-9]+)\/" \/>/ms', $content, 1); $param['title'] = trim($this->match('/<title>(.*?) \(.*?<\/title>/ms', $content, 1)); $param['type'] = $this->match('/<meta.*?property=.og:type.*?content=.(.*?)(\'|")/ms', $content, 1); $param['year'] = trim($this->match('/<title>.*?\(.*?([0-9][0-9][0-9][0-9]).*?\).*?<\/title>/ms', $content, 1)); $param['rating'] = $this->match('/<span itemprop="ratingvalue">([0-9].[0-9])<\/span>/m', $content, 1); $param['ratingcount'] = $this->match('/<span itemprop="ratingcount">(.*?)<\/span>/m', $content, 1); $param['reviewcount'] = $this->match('/<span itemprop="reviewcount">(.*?)<\/span>/m', $content, 1); $param['trailer'] = $this->match('|<a href="{0,1}(/video/imdb/vi\d*/)|ims', $content, 1); $param['genres'] = array(); foreach ($this->match_all('/<a.*?>(.*?)<\/a>/ms', $this->match('/genre.?:(.*?)(<\/div>|see more)/ms', $content, 1), 1) $m) { array_push($param['genres'], $m); } $param[genres] = is_array(($param[genres])) ? implode(", ", ($param[genres])) : ($param[genres]); $param['directors'] = array(); foreach ($this->match_all('/<a.*?>(.*?)<\/a>/ms', $this->match('/director.?:(.*?)(<\/div>|>.?and )/ms', $content, 1), 1) $m) { array_push($param['directors'], $m); } $param[directors] = is_array(($param[directors])) ? implode(", ", ($param[directors])) : ($param[directors]); $param['writers'] = array(); foreach ($this->match_all('/<a.*?>(.*?)<\/a>/ms', $this->match('/writer.?:(.*?)(<\/div>|>.?and )/ms', $content, 1), 1) $m) { array_push($param['writers'], $m); } $param[writers] = is_array(($param[writers])) ? implode(", ", ($param[writers])) : ($param[writers]); $param['stars'] = array(); foreach ($this->match_all('/<a.*?>(.*?)<\/a>/ms', $this->match('/stars:(.*?)<\/div>/ms', $content, 1), 1) $m) { array_push($param['stars'], $m); } $param[stars] = is_array(($param[stars])) ? implode(", ", ($param[stars])) : ($param[stars]); $param['cast'] = array(); foreach ($this->match_all('/<td class="name">(.*?)<\/td>/ms', $content, 1) $m) { array_push($param['cast'], trim(strip_tags($m))); } $param[cast] = is_array(($param[cast])) ? implode(", ", ($param[cast])) : ($param[cast]); $param['mpaa_rating'] = $this->match('/infobar">.<img.*?alt="(.*?)".*?>/ms', $content, 1); if ($param['title_id'] != "") { $releaseinfohtml = $this->geturl("http://www.imdb.com/title/" . $param['title_id'] . "/releaseinfo"); $param['also_known_as'] = $this->getakatitles($releaseinfohtml, $usa_title); $param[also_known_as] = is_array(($param[also_known_as])) ? implode("<br />", ($param[also_known_as])) : ($param[also_known_as]); $param['usa_title'] = $usa_title; $param['release_date'] = $this->match('/release date:<\/h4>.*?([0-9][0-9]? (january|february|march|april|may|june|july|august|september|october|november|december) (19|20)[0-9][0-9]).*?(\(|<span)/ms', $content, 1); $param['release_dates'] = $this->getreleasedates($releaseinfohtml); $param[release_dates] = is_array(($param[release_dates])) ? implode("<br />", ($param[release_dates])) : ($param[release_dates]); } $param['plot'] = trim(strip_tags($this->match('/users:.*?<p>(.*?)(<\/p>|<a)/ms', $content, 1))); $param['poster'] = $this->match('/img_primary">.*?<img src="(.*?)".*?<\/td>/ms', $content, 1); $param['poster_large'] = ""; $param['poster_small'] = ""; if ($param['poster'] != '' && strrpos($param['poster'], "nopicture") === false && strrpos($param['poster'], "ad.doubleclick") === false) { $param['poster_large'] = substr($param['poster'], 0, strrpos($param['poster'], "_v1.")) . "_v1._sy500.jpg"; $param['poster_small'] = substr($param['poster'], 0, strrpos($param['poster'], "_v1.")) . "_v1._sy150.jpg"; } else { $param['poster'] = ""; } $param['runtime'] = trim($this->match('/runtime:<\/h4>.*?([0-9]+) min.*?<\/div>/ms', $content, 1)); if ($param['runtime'] == '') $param['runtime'] = trim($this->match('/infobar.*?([0-9]+) min.*?<\/div>/ms', $content, 1)); $param['oscars'] = trim($this->match('/won ([0-9]+) oscars./ms', $content, 1)); $param['awards'] = trim($this->match('/([0-9]+) wins/ms', $content, 1)); $param['nominations'] = trim($this->match('/([0-9]+) nominations/ms', $content, 1)); $param['storyline'] = trim(strip_tags($this->match('/storyline<\/h2>(.*?)(<em|<\/p>|<span)/ms', $content, 1))); $param['release_date'] = trim(strip_tags($this->match('/release date.?:(.*?)(<\/div>|see more)/ms', $content, 1))); $param['keywords'] = array(); foreach ($this->match_all('/<a.*?>(.*?)<\/a>/ms', $this->match('/plot keywords.?:(.*?)(<\/div>|see more)/ms', $content, 1), 1) $m) { array_push($param['keywords'], $m); } $param[keywords] = is_array(($param[keywords])) ? implode(", ", ($param[keywords])) : ($param[keywords]); $param['tagline'] = trim(strip_tags($this->match('/tagline.?:<\/h4>(.*?)(<span|<\/div)/ms', $content, 1))); $param['votes'] = $this->match('/href="ratings".*?>([0-9]+,?[0-9]*) votes<\/a>\)/ms', $content, 1); $param[votes] = is_array(($param[votes])) ? implode(", ", ($param[votes])) : ($param[votes]); $param['languages'] = $this->match_all('/a href="\/language\/.*?">(.*?)<\/a>/ms', $content, 1); $param['languages'] = array_unique($param['languages']); $param[languages] = is_array(($param[languages])) ? implode(", ", ($param[languages])) : ($param[languages]); $param['countries'] = array(); foreach ($this->match_all('/<a.*?>(.*?)<\/a>/ms', $this->match('/country.?:(.*?)(<\/div>|see more)/ms', $content, 1), 1) $m) { array_push($param['countries'], $m); } $param[countries] = is_array(($param[countries])) ? implode(", ", ($param[countries])) : ($param[countries]); $param['companies'] = $this->match_all('/a.*?href="\/company\/.*?">(.*?)<\/a>/ms', $content, 1); $param['companies'] = array_unique($param['companies']); $param[companies] = is_array(($param[companies])) ? implode(", ", ($param[companies])) : ($param[companies]); return $param; } function getreleasedates($content) { $releasedates = array(); foreach ($this->match_all('/<tr>(.*?)<\/tr>/ms', $this->match('/date<\/th><\/tr>(.*?)<\/table>/ms', $content, 1), 1) $r) { $country = trim(strip_tags($this->match('/<td><b>(.*?)<\/b><\/td>/ms', $r, 1))); $date = trim(strip_tags($this->match('/<td align="right">(.*?)<\/td>/ms', $r, 1))); array_push($releasedates, $country . " = " . $date); } return $releasedates; } function getakatitles($content, &$usa_title) { $akatitles = array(); foreach ($this->match_all('/<tr>(.*?)<\/tr>/msi', $this->match('/also known as(.*?)<\/table>/ms', $content, 1), 1) $m) { $akatitlematch = $this->match_all('/<td>(.*?)<\/td>/ms', $m, 1); $akatitle = trim($akatitlematch[0]); $akacountry = trim($akatitlematch[1]); array_push($akatitles, $akatitle . " = " . $akacountry); if ($akacountry != '' && strrpos(strtolower($akacountry), "usa") !== false) $usa_title = $akatitle; } return $akatitles; }
}
there few errors behind this:
- in
imdbgrabber::scruburl($input)
method there wrong regexp, there may characters after double quote , before http. if you, i'd rather use google custom search engine api search it. current approach you'll banned after few hundreds-thousands attempts. fixed regexp be:
$urls = $this->match_all('/<a[\s\s]*?href="[\s\s]*?(http[s]{0,1}:\/\/www.imdb.com\/title\/[\s\s]*?)\//', $content, 1);
- the condition
stripos($content, "<meta name=\"application-name\"...
you're trying test seems wrong. downloaded htmltitle/tt0371746/
, there no such string. i'd useif (stripos($content, "your rating:") !== false) {
after 2 changes script outputs this:
array(34) { ["title_id"]=> string(9) "tt0371746" ["title"]=> string(8) "iron man" ["type"]=> string(11) "video.movie" ["year"]=> string(4) "2008" ["rating"]=> string(3) "7.9" ["ratingcount"]=> string(7) "578,477" ["reviewcount"]=> string(10) "1,017 user" ["trailer"]=> string(24) "/video/imdb/vi447873305/" ["genres"]=> string(28) " action, adventure, sci-fi" ["directors"]=> string(57) "<span class="itemprop" itemprop="name">jon favreau</span>" ["writers"]=> string(131) "<span class="itemprop" itemprop="name">mark fergus</span>, <span class="itemprop" itemprop="name">hawk ostby</span>, 6 more credits" ["stars"]=> string(214) "<span class="itemprop" itemprop="name">robert downey jr.</span>, <span class="itemprop" itemprop="name">gwyneth paltrow</span>, <span class="itemprop" itemprop="name">terrence howard</span>, see full cast , crew" ["cast"]=> string(0) "" ["mpaa_rating"]=> bool(false) ["also_known_as"]=> string(0) "" ["usa_title"]=> null ["release_date"]=> string(24) "1 may 2008 (netherlands)" ["release_dates"]=> string(0) "" ["plot"]=> string(0) "" ["poster"]=> string(0) "" ["poster_large"]=> string(0) "" ["poster_small"]=> string(0) "" ["runtime"]=> string(3) "126" ["oscars"]=> string(0) "" ["awards"]=> string(2) "18" ["nominations"]=> string(2) "51" ["storyline"]=> string(856) "tony stark. genius, billionaire, playboy, philanthropist. son of legendary inventor , weapons contractor howard stark. when tony stark assigned give weapons presentation iraqi unit led lt. col. james rhodes, he's given ride on enemy lines. ride ends badly when stark's humvee he's riding in attacked enemy combatants. survives - barely - chest full of shrapnel , car battery attached heart. in order survive comes way miniaturize battery , figures out battery can power else. iron man born. uses primitive device escape cave in iraq. once home, begins work on perfecting iron man suit. man put in charge of stark industries has plans of own take on tony's technology other matters." ["keywords"]=> string(304) " <span class="itemprop" itemprop="keywords">armor</span>, <span class="itemprop" itemprop="keywords">cave</span>, <span class="itemprop" itemprop="keywords">iron</span>, <span class="itemprop" itemprop="keywords">genius</span>, <span class="itemprop" itemprop="keywords">missile</span>, see (198)" ["tagline"]=> string(52) "get ready different breed of heavy metal hero." ["votes"]=> bool(false) ["languages"]=> string(153) "|</span> <a href="/language/fa?ref_=tt_dt_dt" itemprop='url'>persian, |</span> <a href="/language/ar?ref_=tt_dt_dt" itemprop='url'>arabic" ["countries"]=> string(3) "usa" ["companies"]=> string(75) "paramount pictures</span>, marvel enterprises</span>, marvel studios</span>" ["imdb_url"]=> string(36) "http://www.imdb.com/title/tt0371746/" }
with bunch of php notices.
but nightmare maintain , have say, code not best of best. consider using this approach or parser based on xml/xpath addressing. see this well.
Comments
Post a Comment