Created
March 21, 2018 13:13
-
-
Save Bertware/12b22879c89d6aa8ab361d856ce56ebb to your computer and use it in GitHub Desktop.
Quick-and-dirty PHP script to scrape information on all available episodes from vrtnu.be. It creates a JSON representation for each page, plus a list that puts everything together.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Write data to a file, creating any missing parent directories first.
 *
 * Leading "/" characters are stripped from the path, so a site-relative path
 * such as "/vrtnu/a-z/foo.json" is written relative to the current working
 * directory instead of the filesystem root.
 *
 * @param string $file Target path; leading slashes are removed before use.
 * @param mixed  $data Payload, handed unchanged to file_put_contents().
 *
 * @return int|false Number of bytes written, or false when the parent
 *                   directory could not be created or the write failed.
 */
function file_put_contents_rec(string $file, $data)
{
    $file = ltrim($file, '/');
    $folder = dirname($file);

    // dirname() yields "." for a bare file name; nothing has to be created then.
    if ($folder !== '' && $folder !== '.' && !is_dir($folder)) {
        // is_dir() is re-checked after a failed mkdir() because another process
        // may have created the directory between the two calls.
        if (!mkdir($folder, 0777, true) && !is_dir($folder)) {
            return false;
        }
    }

    return file_put_contents($file, $data);
}
// Other endpoints noted by the author (not used by this script):
// schedule: https://www.vrt.be/bin/epg/schedule.json
// live: https://services.vrt.be/videoplayer/r/live.json
// jsonp!

// ---------------------------------------------------------------------------
// Step 1: scrape the A-Z overview page.
//
// Builds $data (glossary header text => list of program records) and
// $totalPrograms (number of records), then stores $data as vrtnu/a-z.json.
// Both variables are read again by the episode scraping further down.
// ---------------------------------------------------------------------------
$html = file_get_contents("https://www.vrt.be/vrtnu/a-z/");
if ($html === false) {
    // Without the overview there is nothing to scrape; stop instead of
    // writing an empty a-z.json.
    throw new RuntimeException('Unable to download https://www.vrt.be/vrtnu/a-z/');
}

$tidy = tidy_parse_string($html);
if ($tidy === false) {
    throw new RuntimeException('Unable to parse the A-Z overview page with tidy');
}
$tidy->cleanRepair();

$xmldoc = new DOMDocument();
// "@" silences the libxml warnings that real-world HTML triggers.
@$xmldoc->loadHTML($tidy->html()->value);
$xPath = new DOMXPath($xmldoc);

$xPath_vrtglossary_group = "/html/body/div[@class='main']/main/div/div[@class='vrtglossary__groups']/div";
$xPath_vrtglossary_header_rel = "./div[@class='vrtglossary__group__title__letter']/h2/text()"; // use text content
$xPath_vrtglossary_item_rel = "./div/ul/li/a";
$xPath_item_imgurl = "./div[@class='tile__image-wrapper']/div/picture/source/@srcset";
$xPath_item_title = "./div[@class='tile__content-wrapper']/h3/text()"; // use text content
$xPath_item_description = "./div[@class='tile__content-wrapper']/div[@class='tile__description']"; // use text content

// Returns the text content of the first node matching $expression below
// $context, or null when nothing matches.
$firstNodeText = function (DOMXPath $xp, $expression, DOMNode $context) {
    $nodes = $xp->query($expression, $context);
    if ($nodes === false || $nodes->length === 0) {
        return null;
    }

    return $nodes->item(0)->textContent;
};

$queryResult = $xPath->query($xPath_vrtglossary_group);
$data = [];
$totalPrograms = 0;

foreach ($queryResult as $sectionElement) {
    // A group without a header is stored under the empty-string key.
    $sectionKey = (string) $firstNodeText($xPath, $xPath_vrtglossary_header_rel, $sectionElement);
    $items = $xPath->query($xPath_vrtglossary_item_rel, $sectionElement);
    $sectionData = [];

    foreach ($items as $programElement) {
        $href = $programElement->getAttribute('href');

        $itemData = [];
        $itemData['img'] = [];

        // srcset is a comma-separated list of "<url> <descriptor>" candidates.
        $srcset = (string) $firstNodeText($xPath, $xPath_item_imgurl, $programElement);
        foreach (explode(',', $srcset) as $candidate) {
            $parts = preg_split('/\s+/', trim($candidate));
            if ($parts === false || $parts[0] === '') {
                continue; // no srcset at all, or an empty candidate
            }

            $imgUrl = $parts[0];
            if (strncmp($imgUrl, '//', 2) === 0) {
                $imgUrl = "https:" . $imgUrl; // protocol-relative URL
            } elseif (strncmp($imgUrl, '/', 1) === 0) {
                $imgUrl = "https://www.vrt.be" . $imgUrl; // site-relative URL
            }

            // A candidate without a descriptor counts as "1x" (HTML srcset rules).
            $descriptor = isset($parts[1]) ? $parts[1] : '1x';
            $itemData['img'][$descriptor] = $imgUrl;
        }

        $itemData['title'] = trim((string) $firstNodeText($xPath, $xPath_item_title, $programElement));
        $itemData['description'] = trim((string) $firstNodeText($xPath, $xPath_item_description, $programElement));
        // The program link ends in ".relevant/"; swapping that suffix for ".json"
        // gives the path used for the per-program JSON file.
        $itemData['api']['details'] = str_replace(".relevant/", ".json", $href);
        $itemData['url'] = "https://www.vrt.be" . $href;
        $itemData['relative'] = $href;

        $sectionData[] = $itemData;
        $totalPrograms++;
    }

    $data[$sectionKey] = $sectionData;
}

file_put_contents_rec("vrtnu/a-z.json", json_encode($data, JSON_UNESCAPED_SLASHES | JSON_NUMERIC_CHECK | JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE));
// ---------------------------------------------------------------------------
// Step 2: scrape every program found in $data.
//
// For each program: collect its episode links (or, when the page shows no
// episode list, the single URL the program link redirects to), download each
// episode page, extract its metadata and write one JSON file per episode and
// one per program. Everything is finally combined into vrtnu/list.json.
//
// NOTE(review): output paths are derived from hrefs found in the scraped
// pages; they are used as-is apart from the leading slash that
// file_put_contents_rec() strips.
// ---------------------------------------------------------------------------
$programs = [];
$i = 0;
$jsonFlags = JSON_UNESCAPED_SLASHES | JSON_NUMERIC_CHECK | JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE;

// Episode list on a program page.
$xPath_episodes = "/html/body/div[@class='main']/main/div/div[@class='episodeslist']/div[@class='list']/div/div[@class='vrtlist__body']/div/ul/li/a";
$xPath_episodeDate = "./div[@class='tile__content-wrapper']/div/div/span[@class='tile__broadcastdate--mobile']";

// Detail container on an episode page, and the fields relative to it.
$xPath_details = "/html/body/div[@class='main']/main/div/div[@class='content-container']/div[@class='content']";
$xPath_episodeTitle = "./div[@class='content__container']/span[@class='content__title--episode']";
$xPath_episodeShortDesc = "./div[@class='content__container']/span[@class='content__shortdescription']";
$xPath_episodeDesc = "./div[@class='content__description']";
$xPath_available = "./div[@class='content__container']/div[@class='content__metadata']/div[@class='content__availability']/span";
$xPath_episodeNumber = "./div[@class='content__container']/div[@class='content__metadata']/div[@class='content__metadata--main']/div[@class='content__episode']/span";
$xPath_duration = "./div[@class='content__container']/div[@class='content__metadata']/div[@class='content__metadata--main']/time";
$xPath_country = "./div[@class='content__container']/div[@class='content__metadata']/div[@class='content__region']";
$xPath_categories = "./div[@class='content__container']/ul[@class='content__categories']/li/a";

// Downloads $url, repairs the markup with tidy and returns a DOMXPath over the
// result, or null when the page cannot be downloaded or parsed.
$loadPage = function ($url) {
    $html = file_get_contents($url);
    if ($html === false) {
        return null;
    }

    $tidy = tidy_parse_string($html);
    if ($tidy === false) {
        return null;
    }
    $tidy->cleanRepair();

    $doc = new DOMDocument();
    // "@" silences the libxml warnings that real-world HTML triggers.
    @$doc->loadHTML($tidy->html()->value);

    return new DOMXPath($doc);
};

// Returns the untrimmed text content of the first node matching $expression,
// or null when nothing matches. A null $context evaluates against the document.
$nodeText = function (DOMXPath $xp, $expression, ?DOMNode $context = null) {
    $nodes = $xp->query($expression, $context);
    if ($nodes === false || $nodes->length === 0) {
        return null;
    }

    return $nodes->item(0)->textContent;
};

foreach ($data as $sectionPrograms) {
    foreach ($sectionPrograms as $program) {
        $i++;

        // --- collect the episode links shown on the program page ---
        $rawEpisodes = [];
        $programXPath = $loadPage($program['url']);
        $episodeElements = $programXPath !== null ? $programXPath->query($xPath_episodes) : false;

        if ($episodeElements !== false) {
            foreach ($episodeElements as $episodeElement) {
                $href = $episodeElement->getAttribute('href');
                $episodeUrl = "https://www.vrt.be" . $href;

                // A fresh array per episode, so no field carries over from a previous one.
                $rawEpisodes[] = [
                    'relative' => $href,
                    'url' => $episodeUrl,
                    // replace the trailing slash of an episode url with ".mssecurevideo.json" to get json info.
                    // However, this requires an account, and accepting the terms (which we kinda can't),
                    // so just record the URL for now and let the end user sign in.
                    'stream' => rtrim($episodeUrl, '/') . '.mssecurevideo.json',
                    'date' => $nodeText($programXPath, $xPath_episodeDate, $episodeElement),
                    'id' => basename($episodeUrl),
                ];
            }
        }

        // --- fallback when no list is shown: follow the program link's redirects ---
        if ($rawEpisodes === []) {
            $fullUrl = $program['url'];
            $ch = curl_init($program['url']);
            if ($ch !== false) {
                curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
                curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
                if (curl_exec($ch) !== false) {
                    $fullUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
                }
                curl_close($ch);
            }

            // Use the URL's path component as the site-relative location.
            $path = parse_url($fullUrl, PHP_URL_PATH);

            // This entry has no 'date': there is no list tile to read it from.
            $rawEpisodes[] = [
                'relative' => (is_string($path) && $path !== '') ? $path : $program['relative'],
                'url' => $fullUrl,
                'stream' => rtrim($fullUrl, '/') . '.mssecurevideo.json',
                'id' => basename($fullUrl),
            ];
        }

        // --- fetch and parse every episode page ---
        foreach ($rawEpisodes as $rawEpisode) {
            // Load the episode's own page; the program page would give every
            // episode the same details.
            $episodeXPath = $loadPage($rawEpisode['url']);
            if ($episodeXPath === null) {
                print "[" . date("H:i:s") . "] skipped (could not load) " . $rawEpisode['url'] . PHP_EOL;
                continue;
            }

            $detailNodes = $episodeXPath->query($xPath_details);
            $detailsNode = $detailNodes !== false ? $detailNodes->item(0) : null;

            $episodeDetails = $rawEpisode;
            // Episodes without their own title take the program's title.
            $episodeDetails['title'] = $nodeText($episodeXPath, $xPath_episodeTitle, $detailsNode) ?? $program['title'];
            $episodeDetails['tagline'] = $nodeText($episodeXPath, $xPath_episodeShortDesc, $detailsNode);
            // The fields below become an empty string when absent from the page.
            $episodeDetails['description'] = trim((string) $nodeText($episodeXPath, $xPath_episodeDesc, $detailsNode));
            $episodeDetails['available_until'] = trim((string) $nodeText($episodeXPath, $xPath_available, $detailsNode));
            $episodeDetails['duration'] = trim((string) $nodeText($episodeXPath, $xPath_duration, $detailsNode));
            $episodeDetails['restrictions'] = trim((string) $nodeText($episodeXPath, $xPath_country, $detailsNode));
            $episodeDetails['number'] = trim((string) $nodeText($episodeXPath, $xPath_episodeNumber, $detailsNode));

            $episodeDetails['categories'] = [];
            $categoryNodes = $episodeXPath->query($xPath_categories, $detailsNode);
            if ($categoryNodes !== false) {
                foreach ($categoryNodes as $category) {
                    $episodeDetails['categories'][] = $category->textContent;
                }
            }

            $programs[$program['title']]['episodes'][] = $episodeDetails;
            file_put_contents_rec(rtrim($episodeDetails['relative'], '/') . '.json', json_encode($episodeDetails, $jsonFlags));
        }

        // $i was incremented at the top of the iteration, so the last program reports 100%.
        print "[" . date("H:i:s") . "] " . $i . "/" . $totalPrograms . " " . round(100 * $i / $totalPrograms) . "% " . $program['title'] . PHP_EOL;
        file_put_contents_rec($program['api']['details'], json_encode($program, $jsonFlags));
    }
}

file_put_contents_rec('vrtnu/list.json', json_encode($programs, $jsonFlags));
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment