File: D:/HostingSpaces/blijegasten/blijegasten.be/app/Komma/Globalization/UpdateServices/WebScraper.php
<?php declare(strict_types=1);
namespace App\Komma\Globalization\UpdateServices;
use Carbon\Carbon;
use Exception;
use GuzzleHttp\Client;
use function GuzzleHttp\Psr7\parse_header;
use Illuminate\Console\OutputStyle;
use Illuminate\Support\Facades\Storage;
use Symfony\Component\DomCrawler\Crawler;
class WebScraper implements UpdateServiceInterface
{
/** Prefix put in front of every message produced by formatMessage() and formatStepMessage(). */
const LOG_PREFIX = 'WebScraper: ';
/** Folder (relative, with trailing slash) handed to the Storage facade for the downloaded source files. */
const DOWNLOAD_FOLDER = 'WebScraper/';
/** Directory, resolved relative to this file, into which the generated dataset classes are written. */
const DATASET_DIR = __DIR__.DIRECTORY_SEPARATOR.'..'.DIRECTORY_SEPARATOR.'Datasets'.DIRECTORY_SEPARATOR;
/**
 * Number of step messages produced so far; incremented by formatStepMessage().
 *
 * @var int
 */
private $stepCounter = 0;
/**
 * Optional console output. Stays null until setOutputStyle() is called.
 *
 * @var OutputStyle|null
 */
private $output;
/**
 * Names of the methods that doUpdate() invokes, in execution order.
 *
 * @var string[]
 */
private $steps = [
'makeDownloadFolder',
'downloadFiles',
'transpileData',
];
/**
 * Entry point of this update service.
 *
 * Invokes every method named in $steps, in order, and afterwards reports completion
 * when an output has been set.
 */
public function doUpdate()
{
    foreach ($this->steps as $stepMethod) {
        $this->{$stepMethod}();
    }

    if (!$this->output) {
        return;
    }

    $this->output->writeln($this->formatStepMessage('Done!'));
}
/**
 * Makes sure the download folder exists, creating it when it is missing.
 *
 * @throws \RuntimeException when the folder still does not exist after the creation attempt
 */
private function makeDownloadFolder()
{
    $folder = self::DOWNLOAD_FOLDER;

    if (!Storage::exists($folder)) {
        Storage::makeDirectory($folder);
    }

    // Re-check instead of trusting the return value of makeDirectory().
    if (Storage::exists($folder)) {
        return;
    }

    throw new \RuntimeException(
        $this->formatMessage('Could not create download folder. Check permissions: '.Storage::path($folder))
    );
}
/**
 * Empties the download folder by deleting it, including its contents, and creating it again.
 */
private function clearDownloadFolder()
{
    $folder = self::DOWNLOAD_FOLDER;

    Storage::deleteDirectory($folder);
    Storage::makeDirectory($folder);
}
/**
 * Downloads every file listed by filesToDownload() into the download folder.
 *
 * A file is fetched only when it is missing or was last modified more than a day ago. Each response
 * body is read completely, converted to UTF-8 in one go and written with a single Storage::put().
 * A failed download is reported to the output (when one is set) and does not stop the remaining
 * downloads; a previously downloaded copy, if any, is left untouched in that case.
 */
private function downloadFiles()
{
    if ($this->output) {
        $this->output->writeln($this->formatStepMessage('Downloading globalisation files.'));
    }

    foreach ($this->filesToDownload() as $name => $url) {
        $relativePath = self::DOWNLOAD_FOLDER.$name;

        $isUpToDate = Storage::exists($relativePath)
            && Storage::lastModified($relativePath) >= Carbon::now()->subDay()->timestamp;

        if ($isUpToDate) {
            if ($this->output) {
                $this->output->writeln($this->formatMessage('File from url "'.$url.'" did not have to be downloaded since it already was today.'));
            }
            continue;
        }

        try {
            if ($this->output) {
                $this->output->write($this->formatMessage('Starting to download '.$url.'...'));
            }

            $client = new Client();
            $response = $client->get($url);
            $body = $response->getBody();

            // The charset parameter is optional in a content-type header; fall back to UTF-8 when absent or empty.
            $parsed = parse_header($response->getHeader('content-type'));
            $charset = (isset($parsed[0]['charset']) && $parsed[0]['charset'] !== '') ? $parsed[0]['charset'] : 'UTF-8';

            $progressBar = null;
            if ($this->output) {
                // Progress is counted in reads of 1024 bytes. The size may be unknown (null) and the
                // progress bar expects an integer, so round up explicitly.
                $size = $body->getSize();
                $progressBar = $this->output->createProgressBar($size === null ? 0 : (int) ceil($size / 1024));
            }

            $contents = '';
            while (!$body->eof()) {
                $contents .= $body->read(1024);
                if ($progressBar) {
                    $progressBar->advance(1);
                }
            }

            if ($progressBar) {
                $progressBar->finish();
                $this->output->writeln('');
            }

            // Convert the complete body at once: converting 1024-byte chunks separately can cut a
            // multibyte character in half and replace it with a substitute character.
            $utf8Contents = mb_convert_encoding($contents, 'UTF-8', $charset);

            // Storage::put() overwrites any stale copy. Storage::append() is deliberately not used
            // because it joins existing content and new data with a separator (a newline by default),
            // which would inject a line break after every chunk.
            if (!Storage::put($relativePath, $utf8Contents)) {
                throw new \RuntimeException('Could not write "'.$relativePath.'".');
            }

            if ($this->output) {
                $this->output->writeln($this->formatMessage('Downloaded '.$url.'. successfully.'));
            }
        } catch (Exception $exception) {
            if ($this->output) {
                $this->output->writeln($this->formatMessage('Could not download "'.$url.'". '.$exception->getMessage()));
            }
        }
    }
}
/**
 * Turns the downloaded files into the generated dataset classes.
 *
 * Builds the country/region arrays, passes them through each enrichment method in order,
 * builds the language rows and finally writes both dataset classes.
 */
private function transpileData()
{
    if ($this->output) {
        $this->output->writeln($this->formatStepMessage('Transpiling downloaded data into culture info data that the app understands'));
    }

    // Each enricher receives the country/region arrays and returns the updated arrays.
    $enrichers = [
        'updateCountryRegionArraysWithNativeNames',
        'updateCountryRegionArraysWithCurrencySymbols',
        'updateCountryRegionArraysWithMetricationStatus',
        'updateCountryRegionArraysWithFormattingAndMajorMinorCurrencyInfo',
    ];

    $countries = $this->transpileCountryRegionArrays();
    foreach ($enrichers as $enricher) {
        $countries = $this->{$enricher}($countries);
    }

    $languages = $this->transpileLanguageArrays();

    $this->createLanguagesClass($languages);
    $this->createCountryClass($countries);
}
/**
 * Writes the generated Languages dataset class into DATASET_DIR.
 *
 * @param array $languagesArray Language rows; exported with var_export() as the class' DATA constant
 * @throws \RuntimeException when the class file cannot be written
 */
private function createLanguagesClass($languagesArray) {
    $targetFile = self::DATASET_DIR.'Languages.php';
    $languageString = var_export($languagesArray, true);
    // The heredoc body and its closing marker stay at column 0 so the generated file is unaffected by this method's indentation.
    $fileData = <<<EOT
<?php
namespace App\Komma\Globalization\Datasets;
/**
* Class Languages
*
* @package App\Komma\Globalization
*/
abstract class Languages
{
/** @var array Language data. */
public const DATA = {$languageString};
}
EOT;

    if (file_put_contents($targetFile, $fileData) === false) {
        throw new \RuntimeException($this->formatMessage('Could not write the language class to: '.$targetFile));
    }

    // The output is optional, and the reported path must be the file that was actually written.
    if ($this->output) {
        $this->output->writeln('Create the language class at: '.$targetFile);
    }
}
/**
 * Writes the generated Countries dataset class into DATASET_DIR.
 *
 * @param array $countryArray Country/region arrays; exported with var_export() as the class' DATA constant
 * @throws \RuntimeException when the class file cannot be written
 */
private function createCountryClass($countryArray)
{
    $targetFile = self::DATASET_DIR.'Countries.php';
    $countriesString = var_export($countryArray, true);
    // The heredoc body and its closing marker stay at column 0 so the generated file is unaffected by this method's indentation.
    $fileData = <<<EOT
<?php
namespace App\Komma\Globalization\Datasets;
/**
* Class Countries
*
* @package App\Komma\Globalization
*/
abstract class Countries
{
/** @var array Country data. */
public const DATA = {$countriesString};
}
EOT;

    if (file_put_contents($targetFile, $fileData) === false) {
        throw new \RuntimeException($this->formatMessage('Could not write the country class to: '.$targetFile));
    }

    if ($this->output) {
        $this->output->writeln('Create the country class at: '.$targetFile);
    }
}
/**
 * Builds one country/region array per usable row of the downloaded countryInfo.txt.
 *
 * Comment lines (starting with "#") and rows that do not have exactly 19 tab separated columns are
 * skipped. Fields that cannot be derived from this file get placeholder values here.
 *
 * @return array list of country/region arrays; empty when countryInfo.txt has not been downloaded
 * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
 */
private function transpileCountryRegionArrays()
{
    $countryRegionArrays = [];
    $sourceFile = self::DOWNLOAD_FOLDER . 'countryInfo.txt';

    if (!Storage::exists($sourceFile)) {
        return $countryRegionArrays;
    }

    //Read the country info file and keep only the data rows
    $rows = [];
    foreach (preg_split('/\r\n|\r|\n/', Storage::get($sourceFile)) as $line) {
        //Filter out comments
        if (substr($line, 0, 1) === '#') {
            continue;
        }
        //Filter out incorrect entries (anything other than 19 columns)
        $columns = explode("\t", $line);
        if (count($columns) !== 19) {
            continue;
        }
        $rows[] = $columns;
    }

    if ($this->output) {
        $this->output->writeln($this->formatMessage('Transpiling country/region data arrays for ' . count($rows) . ' countries / regions...'));
    }

    //Every row has exactly 19 columns, so the indexes 0..18 used below always exist
    foreach ($rows as $countryData) {
        //A language entry may carry a suffix after a dash (for example "en-US"); only the part
        //before the dash is kept. Empty entries and duplicates are dropped.
        $languageCodes = [];
        foreach (explode(',', $countryData[15]) as $languageCode) {
            $languageCode = explode('-', trim(strtoupper($languageCode)))[0];
            if ($languageCode !== '' && !in_array($languageCode, $languageCodes, true)) {
                $languageCodes[] = $languageCode;
            }
        }

        $countryRegionArrays[] = [
            'Name' => $countryData[0], //ISO-3166
            'DisplayName' => $countryData[4],
            'NativeName' => '', //Update me later on in updateCountryRegionArraysWithNativeNames
            'IsMetric' => true, //Update me later on
            'TwoLetterISORegionName' => $countryData[0], //ISO-3166
            'ThreeLetterISORegionName' => $countryData[1], //ISO-3166
            'CurrencyName' => $countryData[11],
            'CurrencyMinorName' => '', //Update me later on
            'CurrencySymbol' => '', //Update me later on //ISO-4217
            'ISOCurrencySymbol' => $countryData[10],
            'ISOLanguages' => $languageCodes,
            'numberFormatting' => [
                'currencyDecimalDigits' => 2, //Update me later on
                'currencyDecimalSeparator' => '.', //Update me later on
                'currencyGroupSeparator' => ' ', //Update me later on
                'currencyGroupSizes' => 3 //Update me later on
            ]
        ];
    }

    if ($this->output) {
        $this->output->writeln($this->formatMessage('Country / region data arrays transpiled.'));
    }

    return $countryRegionArrays;
}
/**
 * Fills the 'NativeName' of each country/region array from the downloaded native names page.
 *
 * Entries are matched on their 'DisplayName'; entries without a match are left unchanged. When the
 * page has not been downloaded the arrays are returned as they were passed in.
 *
 * @param array $countryRegionArrays
 * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
 * @return array
 */
private function updateCountryRegionArraysWithNativeNames($countryRegionArrays)
{
    $sourceFile = self::DOWNLOAD_FOLDER . 'wiki_countries_native_names.html';
    if (!Storage::exists($sourceFile)) {
        return $countryRegionArrays;
    }

    $crawler = new Crawler(Storage::get($sourceFile));
    $nativeNamesByEnglishName = []; //Structure English name => Native name

    $crawler->filter('.wikitable > tbody > tr')->each(function (Crawler $row) use (&$nativeNamesByEnglishName) {
        $englishCell = $row->filter('td:nth-child(1) > b > a');
        $nativeCell = $row->filter('td:nth-child(3) > b:first-child');
        if ($nativeCell->count() === 0 || $englishCell->count() === 0) {
            return;
        }

        //Prefer the markup in front of the first opening tag; fall back to the complete cell text
        preg_match('/^.+?(?=\<[a-zA-Z]+\>)/', $nativeCell->html(), $leadingText);
        $nativeName = isset($leadingText[0]) ? $leadingText[0] : $nativeCell->text();
        $nativeName = preg_replace('/\n/', '', $nativeName);

        $englishName = $englishCell->text();
        if ($englishName === 'Russia') {
            //The russia entry is defect. Manually correct it
            //NOTE(review): the trailing "b" in this literal looks like a leftover footnote marker - confirm the intended value
            $nativeName = 'Rossiyab';
        }

        $nativeNamesByEnglishName[$englishName] = $nativeName;
    });

    foreach ($countryRegionArrays as $position => $entry) {
        $displayName = $entry['DisplayName'];
        if (isset($nativeNamesByEnglishName[$displayName])) {
            $countryRegionArrays[$position]['NativeName'] = $nativeNamesByEnglishName[$displayName];
        }
    }

    return $countryRegionArrays;
}
/**
 * Fills the 'CurrencySymbol' of each country/region array from the downloaded currency page.
 *
 * Entries are matched on their 'ISOCurrencySymbol'; entries without a match are left unchanged.
 * When the page has not been downloaded the arrays are returned as they were passed in.
 *
 * @param array $countryRegionArrays
 * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
 * @return array
 */
private function updateCountryRegionArraysWithCurrencySymbols($countryRegionArrays)
{
    $sourceFile = self::DOWNLOAD_FOLDER . 'wiki_currency_symbols.html';
    if (!Storage::exists($sourceFile)) {
        return $countryRegionArrays;
    }

    $crawler = new Crawler(Storage::get($sourceFile));
    $symbolsByIsoCode = []; //Structure Currency ISO Code => Symbol

    $crawler->filter('#mw-content-text .wikitable tbody tr')->each(function (Crawler $row) use (&$symbolsByIsoCode) {
        $isoCell = $row->filter('td:nth-child(4)');
        $symbolCell = $row->filter('td:nth-child(3)');
        if ($symbolCell->count() === 0 || $isoCell->count() === 0) {
            return;
        }

        $symbolText = trim($symbolCell->text());
        $isoCode = trim($isoCell->text());
        if ($symbolText === '(none)' || $isoCode === '(none)') {
            return;
        }

        //Only the part in front of the first space is used as the symbol
        $firstWord = explode(' ', $symbolText)[0];
        $symbolsByIsoCode[$isoCode] = preg_replace('/\n/', '', $firstWord);
    });

    foreach ($countryRegionArrays as $position => $entry) {
        $isoCurrency = $entry['ISOCurrencySymbol'];
        if (isset($symbolsByIsoCode[$isoCurrency])) {
            $countryRegionArrays[$position]['CurrencySymbol'] = $symbolsByIsoCode[$isoCurrency];
        }
    }

    return $countryRegionArrays;
}
/**
 * Sets 'IsMetric' on each country/region array from the downloaded metrication page.
 *
 * A country is flagged metric only when its status is "Complete". Countries that cannot be found on
 * the page (directly or through an alternative name) are treated as "Complete". When the page has
 * not been downloaded the arrays are returned as they were passed in.
 *
 * @param array $countryRegionArrays
 * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
 * @return array
 */
private function updateCountryRegionArraysWithMetricationStatus($countryRegionArrays)
{
    $sourceFile = self::DOWNLOAD_FOLDER . 'wiki_metrication.html';
    if (!Storage::exists($sourceFile)) {
        return $countryRegionArrays;
    }

    $crawler = new Crawler(Storage::get($sourceFile));

    //Checked in this order; the first one found inside the cell text wins
    $knownStatusses = ['Almost entirely complete', 'Partially complete', 'Some adoption', 'Complete'];

    $statusses = []; //Country name => Status
    $crawler->filter('.wikitable > tbody tr')->each(function (Crawler $row) use ($knownStatusses, &$statusses) {
        $countryCell = $row->filter('td:nth-child(2)');
        $statusCell = $row->filter('td:nth-child(4)');
        if ($statusCell->count() === 0 || $countryCell->count() === 0) {
            return;
        }

        $status = $statusCell->text();
        foreach ($knownStatusses as $knownStatus) {
            //Compare against false explicitly: strpos() returns 0 (falsy) when the text starts with the status
            if (strpos($status, $knownStatus) !== false) {
                $status = $knownStatus;
                break;
            }
        }

        //Keep only the leading run of word characters and spaces; skip rows without a usable name
        if (preg_match('/^[\w ]+/', trim($countryCell->text()), $matches) !== 1) {
            return;
        }

        $statusses[$matches[0]] = $status;
    });

    $alternativeNames = [
        //Value from $countryRegionArrays          //Value from wiki_metrication.html
        'United States' => 'United States of America',
        'United States Minor Outlying Islands' => 'United States of America'
    ];

    foreach ($countryRegionArrays as $index => $countryRegionArray) {
        $displayName = $countryRegionArray['DisplayName'];

        $status = 'Complete';
        if (isset($statusses[$displayName])) {
            $status = $statusses[$displayName];
        } elseif (isset($alternativeNames[$displayName]) && isset($statusses[$alternativeNames[$displayName]])) {
            //The alternative name itself may be missing from the page as well
            $status = $statusses[$alternativeNames[$displayName]];
        }

        $countryRegionArrays[$index]['IsMetric'] = $status === 'Complete';
    }

    return $countryRegionArrays;
}
/**
 * Sets 'CurrencyMinorName' and 'numberFormatting' on each country/region array from the downloaded
 * currency format page.
 *
 * Entries are matched on their 'ISOCurrencySymbol'; entries without a match are left unchanged.
 * When the page has not been downloaded the arrays are returned as they were passed in.
 *
 * @param array $countryRegionArrays
 * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
 * @return array
 */
private function updateCountryRegionArraysWithFormattingAndMajorMinorCurrencyInfo($countryRegionArrays)
{
    $sourceFile = self::DOWNLOAD_FOLDER . 'currency_format.html';
    if (!Storage::exists($sourceFile)) {
        return $countryRegionArrays;
    }

    $crawler = new Crawler(Storage::get($sourceFile));
    $formatInfo = []; //Currency ISO code => formatting info including the minor unit name

    $crawler->filter('table.tablestyle tr')->each(function (Crawler $row) use (&$formatInfo) {
        $isoCell = $row->filter('td:nth-child(2)');
        $minorCell = $row->filter('td:nth-child(5)');
        $formatCell = $row->filter('td:nth-child(7)');
        if ($isoCell->count() === 0) {
            return;
        }

        $isoCode = $isoCell->text();

        //Defaults that apply when the row offers nothing better
        $info = [
            'currencyDecimalDigits' => 2,
            'currencyGroupSizes' => 3,
            'currencyGroupSeparator' => ',',
            'currencyDecimalSeparator' => '.',
            'minor' => 'cents'
        ];

        if ($minorCell->count() > 0) {
            $minorName = preg_replace('/\n/', '', $minorCell->text());
            if (str_replace(' ', '', $minorName) !== '') {
                $info['minor'] = $minorName;
            }
        }

        //Format example "#,###.##"
        if ($formatCell->count() > 0) {
            $pattern = $formatCell->text();
            $pieces = explode('#', $pattern);

            //The first piece equals the whole text only when the text contains no "#" at all
            if ($pieces[0] !== $pattern) {
                $digitsPerGroup = 0;
                $decimalDigits = 0;
                $decimalMark = null;
                $groupMark = null;

                //Walk the pattern from right to left: empty pieces are digit positions, the first
                //non-empty piece is taken as decimal separator and the next one as group separator.
                //NOTE(review): a pattern without a decimal part would have its group separator
                //read as the decimal separator - confirm whether such patterns occur.
                foreach (array_reverse($pieces) as $piece) {
                    if ($piece === '') {
                        if (!$decimalMark) {
                            $decimalDigits++;
                        } elseif (!$groupMark) {
                            $digitsPerGroup++;
                        }
                        continue;
                    }

                    if (!$decimalMark && !$groupMark) {
                        $decimalMark = $piece;
                    } elseif ($decimalMark && !$groupMark) {
                        $digitsPerGroup++;
                        $groupMark = $piece;
                    }
                }

                //The formatting info about euro does not seem to be correct. We manually correct it here. If more info seems to be incorrect, we need another datasource.
                if ($isoCode === 'EUR') {
                    $groupMark = '.';
                    $decimalMark = ',';
                }

                $info['currencyDecimalDigits'] = $decimalDigits;
                $info['currencyGroupSizes'] = $digitsPerGroup;
                $info['currencyGroupSeparator'] = $groupMark ?? '';
                $info['currencyDecimalSeparator'] = $decimalMark ?? '';
            }
        }

        $formatInfo[$isoCode] = $info;
    });

    foreach ($countryRegionArrays as $position => $entry) {
        $isoCurrency = $entry['ISOCurrencySymbol'];
        if (!isset($formatInfo[$isoCurrency])) {
            continue;
        }

        //The minor unit name is stored on the entry itself, not inside 'numberFormatting'
        $numberFormatting = $formatInfo[$isoCurrency];
        $countryRegionArrays[$position]['CurrencyMinorName'] = $numberFormatting['minor'];
        unset($numberFormatting['minor']);
        $countryRegionArrays[$position]['numberFormatting'] = $numberFormatting;
    }

    return $countryRegionArrays;
}
/**
 * Builds the language rows from the downloaded ISO 639 page.
 *
 * Only table rows whose third cell contains exactly one link are used. Cells that are missing
 * from a row yield an empty string for the corresponding field.
 *
 * @return array[] list of rows with the keys DisplayName, EnglishName, NativeName, ISO-639-1, ISO-639-2 and ISO-639-3
 * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
 */
public function transpileLanguageArrays()
{
    $html = Storage::get(self::DOWNLOAD_FOLDER . 'wiki_iso_data.html');
    $crawler = new Crawler();
    $crawler->addHtmlContent($html, 'UTF-8');

    //Crawler::text() throws on an empty node list, so read optional cells through this guard
    $textOf = function (Crawler $node) {
        return $node->count() > 0 ? $node->text() : '';
    };

    $languageRows = [];
    $crawler->filter('#Table > tbody > tr')->each(function (Crawler $row) use (&$languageRows, $textOf) {
        //The same link supplies both the display name and the English name
        $nameLink = $row->filter('td:nth-child(3) > a');
        if ($nameLink->count() !== 1) {
            return;
        }
        $name = preg_replace('/\n/u', '', $nameLink->text());

        //Only the leading run of word characters of the ISO 639-3 cell is used
        preg_match('/^\w+/u', $textOf($row->filter('td:nth-child(8)')), $iso3Match);
        $iso3 = isset($iso3Match[0]) ? $iso3Match[0] : '';

        //When the native name cell holds a comma separated list, only its first item is used
        $nativeNameText = $textOf($row->filter('td:nth-child(4)'));
        preg_match('/^.+?(?=,)/u', $nativeNameText, $firstNativeName);
        $nativeName = isset($firstNativeName[0]) ? $firstNativeName[0] : $nativeNameText;
        //The document was loaded as UTF-8, so do not depend on the configured internal encoding
        $nativeName = mb_convert_case($nativeName, MB_CASE_TITLE, 'UTF-8');

        $languageRows[] = [
            'DisplayName' => $name,
            'EnglishName' => $name,
            'NativeName' => $nativeName,
            'ISO-639-1' => $textOf($row->filter('td:nth-child(5)')),
            'ISO-639-2' => $textOf($row->filter('td:nth-child(6)')),
            'ISO-639-3' => $iso3,
        ];
    });

    return $languageRows;
}
/**
 * Lists the source files to download.
 *
 * @return string[] file name inside the download folder => url to download it from
 */
private function filesToDownload()
{
    $wikipedia = 'https://en.wikipedia.org/wiki/';

    return [
        'countryInfo.txt' => 'https://download.geonames.org/export/dump/countryInfo.txt',
        'wiki_iso_data.html' => $wikipedia.'List_of_ISO_639-1_codes',
        'wiki_countries_native_names.html' => $wikipedia.'List_of_countries_and_dependencies_and_their_capitals_in_native_languages',
        'wiki_currency_symbols.html' => $wikipedia.'List_of_circulating_currencies',
        'wiki_metrication.html' => $wikipedia.'Metrication',
        'currency_format.html' => 'https://www.thefinancials.com/Default.aspx?SubSectionID=curformat',
        'countries_by_continent' => 'https://simple.wikipedia.org/wiki/List_of_countries_by_continents',
    ];
}
/**
 * Registers the console output that progress messages are written to.
 *
 * Nothing is returned.
 *
 * @param OutputStyle $output
 */
public function setOutputStyle(OutputStyle $output)
{
    $this->output = $output;
}
/**
 * Formats a message that announces the next step.
 *
 * Increments the step counter on every call and puts the log prefix and a
 * "(current/total)" indicator in front of the message.
 *
 * @param $message
 * @return string
 */
private function formatStepMessage($message)
{
    $currentStep = ++$this->stepCounter;
    $totalSteps = count($this->steps);

    return sprintf('%s (%d/%d) %s', self::LOG_PREFIX, $currentStep, $totalSteps, $message);
}
/**
 * Puts the log prefix and a space in front of a message.
 *
 * @param $message
 * @return string
 */
private function formatMessage($message) {
    return sprintf('%s %s', self::LOG_PREFIX, $message);
}
}