HEX
Server: Microsoft-IIS/8.5
System: Windows NT YDAWBH120 6.3 build 9600 (Windows Server 2012 R2 Standard Edition) AMD64
User: tentjecom_web (0)
PHP: 7.4.14
Disabled: NONE
Upload Files
File: D:/HostingSpaces/SBogers10/farmfun.komma.pro/app/Komma/Globalization/UpdateServices/WebScraper.php
<?php

declare(strict_types=1);

namespace App\Komma\Globalization\UpdateServices;

use Carbon\Carbon;
use Exception;
use GuzzleHttp\Client;
use function GuzzleHttp\Psr7\parse_header;
use Illuminate\Console\OutputStyle;
use Illuminate\Support\Facades\Storage;
use Symfony\Component\DomCrawler\Crawler;

class WebScraper implements UpdateServiceInterface
{
    /** Prefix that formatMessage() and formatStepMessage() put in front of every log message. */
    const LOG_PREFIX = 'WebScraper: ';

    /** Storage-relative folder (with trailing slash) in which the downloaded source files are kept. */
    const DOWNLOAD_FOLDER = 'WebScraper/';

    /** Directory (with trailing separator) the generated Languages.php and Countries.php are written to. */
    const DATASET_DIR = __DIR__.DIRECTORY_SEPARATOR.'..'.DIRECTORY_SEPARATOR.'Datasets'.DIRECTORY_SEPARATOR;

    /** Number of step messages formatted so far; incremented by formatStepMessage(). */
    private $stepCounter = 0;

    /**
     * Console output to log to. Stays null until setOutputStyle() has been called.
     *
     * @var OutputStyle|null
     */
    private $output;

    /**
     * The names of the methods to call, in order, to build the culture info data.
     *
     * @var string[]
     */
    private $steps = [
        'makeDownloadFolder',
        'downloadFiles',
        'transpileData',
    ];

    /**
     * Entry point of this update service.
     *
     * Runs every method named in $this->steps, in order, and afterwards writes a "Done!" step message
     * when an output has been set.
     */
    public function doUpdate()
    {
        foreach ($this->steps as $stepMethod) {
            $this->{$stepMethod}();
        }

        if (! $this->output) {
            return;
        }

        $this->output->writeln($this->formatStepMessage('Done!'));
    }

    /**
     * Makes sure the download folder exists on the storage disk, creating it when it is missing.
     *
     * @throws \RuntimeException When the folder still does not exist after trying to create it.
     */
    private function makeDownloadFolder()
    {
        $folder = self::DOWNLOAD_FOLDER;

        if (Storage::exists($folder)) {
            return;
        }

        Storage::makeDirectory($folder);

        if (Storage::exists($folder)) {
            return;
        }

        throw new \RuntimeException($this->formatMessage('Could not create download folder. Check permissions: '.Storage::path($folder)));
    }

    /**
     * Empties the download folder by deleting it from storage and creating it again.
     */
    private function clearDownloadFolder()
    {
        $folder = self::DOWNLOAD_FOLDER;

        Storage::deleteDirectory($folder);
        Storage::makeDirectory($folder);
    }

    /**
     * Downloads every file returned by filesToDownload() into the download folder.
     *
     * A file is only fetched when it is missing or was last modified more than a day ago. The complete
     * response body is converted to UTF-8 once (using the charset from the content-type header when there
     * is one) and then written in a single operation, so an existing file is only replaced after a
     * successful download. A failed download is reported on the output (when one is set) and does not
     * stop the remaining downloads.
     */
    private function downloadFiles()
    {
        if ($this->output) {
            $this->output->writeln($this->formatStepMessage('Downloading globalisation files.'));
        }

        foreach ($this->filesToDownload() as $name => $url) {
            $relativePath = self::DOWNLOAD_FOLDER.$name;

            //Only download the file when it is missing or older than a day. Geonames dumps new stuff each day.
            $isUpToDate = Storage::exists($relativePath)
                && Storage::lastModified($relativePath) >= Carbon::now()->subDay()->timestamp;
            if ($isUpToDate) {
                if ($this->output) {
                    $this->output->writeln($this->formatMessage('File from url "'.$url.'" did not have to be downloaded since it already was today.'));
                }
                continue;
            }

            try {
                if ($this->output) {
                    $this->output->write($this->formatMessage('Starting to download '.$url.'...'));
                }

                $client = new Client();
                $response = $client->get($url);
                $body = $response->getBody();

                //Fall back to UTF-8 when the response does not announce a (non-empty) charset.
                $parsed = parse_header($response->getHeader('content-type'));
                $charset = $parsed[0]['charset'] ?? '';
                if ($charset === '') {
                    $charset = 'UTF-8';
                }

                $progressBar = null;
                if ($this->output) {
                    //Progress is counted in chunks of 1024 bytes. getSize() may be null when the size is unknown.
                    $progressBar = $this->output->createProgressBar((int) ceil(((int) $body->getSize()) / 1024));
                }

                //Collect the raw bytes first. Converting each chunk on its own could cut a multibyte character in half.
                $contents = '';
                while (! $body->eof()) {
                    $contents .= $body->read(1024);
                    if ($progressBar) {
                        $progressBar->advance(1);
                    }
                }
                if ($progressBar) {
                    $progressBar->finish();
                    $this->output->writeln('');
                }

                //Write the whole file at once. This replaces the old file and does not insert separators between chunks.
                if (Storage::put($relativePath, mb_convert_encoding($contents, 'UTF-8', $charset)) === false) {
                    throw new \RuntimeException('Could not write "'.$relativePath.'".');
                }

                if ($this->output) {
                    $this->output->writeln($this->formatMessage('Downloaded '.$url.'. successfully.'));
                }
            } catch (Exception $exception) {
                //Best effort: report the failure and continue with the next file.
                if ($this->output) {
                    $this->output->writeln($this->formatMessage('Could not download "'.$url.'". Reason: '.$exception->getMessage()));
                }
            }
        }
    }

    /**
     * Turns the downloaded files into the generated Languages and Countries dataset classes.
     *
     * The country/region arrays are built first and then handed through every enrichment method in a
     * fixed order. The language arrays are built on their own.
     *
     * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
     */
    private function transpileData()
    {
        if ($this->output) {
            $this->output->writeln($this->formatStepMessage('Transpiling downloaded data into culture info data that the app understands'));
        }

        //Country / region pipeline. Every stage receives the result of the previous one.
        $baseCountries = $this->transpileCountryRegionArrays();
        $withNativeNames = $this->updateCountryRegionArraysWithNativeNames($baseCountries);
        $withCurrencySymbols = $this->updateCountryRegionArraysWithCurrencySymbols($withNativeNames);
        $withMetrication = $this->updateCountryRegionArraysWithMetricationStatus($withCurrencySymbols);
        $countries = $this->updateCountryRegionArraysWithFormattingAndMajorMinorCurrencyInfo($withMetrication);

        $languages = $this->transpileLanguageArrays();

        $this->createLanguagesClass($languages);
        $this->createCountryClass($countries);
    }

    /**
     * Generates the Languages dataset class and writes it to the dataset directory.
     *
     * @param array $languagesArray Language rows; exported with var_export() as the DATA constant of the generated class.
     */
    private function createLanguagesClass($languagesArray)
    {
        $languageString = var_export($languagesArray, true);
        $targetFile = self::DATASET_DIR.'Languages.php';

        $fileData = <<<EOT
<?php
namespace App\Komma\Globalization\Datasets;

/**
 * Class Languages
 *
 * @package App\Komma\Globalization
 */
abstract class Languages
{
    /** @var array Language data. */
    public const DATA = {$languageString};
}
EOT;

        file_put_contents($targetFile, $fileData);

        //The output is optional. Report the path the file was actually written to.
        if ($this->output) {
            $this->output->writeln('Create the language class at: '.$targetFile);
        }
    }

    /**
     * Generates the Countries dataset class and writes it to the dataset directory.
     *
     * @param array $countryArray Country/region arrays; exported with var_export() as the DATA constant of the generated class.
     */
    private function createCountryClass($countryArray)
    {
        $countriesString = var_export($countryArray, true);
        $targetFile = self::DATASET_DIR.'Countries.php';

        $fileData = <<<EOT
<?php
namespace App\Komma\Globalization\Datasets;

/**
 * Class Countries
 *
 * @package App\Komma\Globalization
 */
abstract class Countries
{
    /** @var array Country data. */
    public const DATA = {$countriesString};
}
EOT;

        file_put_contents($targetFile, $fileData);

        //Same (optional) notice as the language class generator gives.
        if ($this->output) {
            $this->output->writeln('Create the country class at: '.$targetFile);
        }
    }

    /**
     * Builds the base country/region arrays from the downloaded countryInfo.txt file.
     *
     * Comment lines (starting with '#') and lines that do not consist of exactly 19 tab separated columns
     * are skipped. Values this file does not provide get placeholders that the update* methods of this
     * class fill in afterwards.
     *
     * @return array List of country/region arrays. Empty when countryInfo.txt has not been downloaded.
     * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
     */
    private function transpileCountryRegionArrays()
    {
        $countryRegionArrays = [];
        if (! Storage::exists(self::DOWNLOAD_FOLDER.'countryInfo.txt')) {
            return $countryRegionArrays;
        }

        //Read the country info file and keep only the well formed data rows
        $lines = preg_split('/\r\n|\r|\n/', Storage::get(self::DOWNLOAD_FOLDER.'countryInfo.txt'));
        $rows = [];
        foreach ($lines as $line) {
            if (substr($line, 0, 1) === '#') {
                continue; //Comment
            }
            $columns = explode("\t", $line);
            if (count($columns) === 19) {
                $rows[] = $columns;
            }
        }

        if ($this->output) {
            $this->output->writeln($this->formatMessage('Transpiling country/region data arrays for '.count($rows).' countries / regions...'));
        }

        //Every row has exactly 19 columns at this point, so the indexes 0 to 18 always exist.
        foreach ($rows as $columns) {
            //Column 15 is a comma separated list of language codes such as "en-US,es-US,haw".
            //Only the part before the dash is kept, upper cased. Empty entries and duplicates are dropped.
            $languageCodes = [];
            foreach (explode(',', $columns[15]) as $languageCode) {
                $languageCode = explode('-', trim(strtoupper($languageCode)))[0];
                if ($languageCode !== '' && ! in_array($languageCode, $languageCodes, true)) {
                    $languageCodes[] = $languageCode;
                }
            }

            $countryRegionArrays[] = [
                'Name' => $columns[0],                       //ISO-3166
                'DisplayName' => $columns[4],
                'NativeName' => '',                          //Update me later on in updateCountryRegionArraysWithNativeNames
                'IsMetric' => true,                          //Update me later on
                'TwoLetterISORegionName' => $columns[0],     //ISO-3166
                'ThreeLetterISORegionName' => $columns[1],   //ISO-3166
                'CurrencyName' => $columns[11],
                'CurrencyMinorName' => '',                   //Update me later on
                'CurrencySymbol' => '',                      //Update me later on
                'ISOCurrencySymbol' => $columns[10],         //ISO-4217
                'ISOLanguages' => $languageCodes,
                'numberFormatting' => [
                    'currencyDecimalDigits' => 2,       //Update me later on
                    'currencyDecimalSeparator' => '.',  //Update me later on
                    'currencyGroupSeparator' => ' ',    //Update me later on
                    'currencyGroupSizes' => 3,          //Update me later on
                ],
            ];
        }

        if ($this->output) {
            $this->output->writeln($this->formatMessage('Country / region data arrays transpiled.'));
        }

        return $countryRegionArrays;
    }

    /**
     * Fills in 'NativeName' for every country/region whose 'DisplayName' is found as English name on the
     * downloaded page with native country names.
     *
     * @param array $countryRegionArrays
     * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
     * @return array The given arrays with 'NativeName' set where a match was found. Returned as given when the page has not been downloaded.
     */
    private function updateCountryRegionArraysWithNativeNames($countryRegionArrays)
    {
        $pagePath = self::DOWNLOAD_FOLDER.'wiki_countries_native_names.html';
        if (! Storage::exists($pagePath)) {
            return $countryRegionArrays;
        }

        $nativeNamesByEnglishName = []; //Structure English name => Native name
        $page = new Crawler(Storage::get($pagePath));
        $page->filter('.wikitable > tbody > tr')->each(function (Crawler $tableRow) use (&$nativeNamesByEnglishName) {
            $englishCell = $tableRow->filter('td:nth-child(1) > b > a');
            $nativeCell = $tableRow->filter('td:nth-child(3) > b:first-child');
            if ($nativeCell->count() === 0 || $englishCell->count() === 0) {
                return;
            }

            //Prefer the html that precedes the first tag inside the cell. Use the full cell text when there is no such tag.
            $leadingHtml = [];
            preg_match('/^.+?(?=\<[a-zA-Z]+\>)/', $nativeCell->html(), $leadingHtml);
            $native = count($leadingHtml) > 0 ? $leadingHtml[0] : $nativeCell->text();
            $native = preg_replace('/\n/', '', $native);

            $english = $englishCell->text();
            if ($english === 'Russia') {
                //The russia entry is defect. Manually correct it.
                //NOTE(review): 'Rossiyab' looks like a typo of 'Rossiya' - confirm before changing the generated data.
                $native = 'Rossiyab';
            }

            $nativeNamesByEnglishName[$english] = $native;
        });

        foreach ($countryRegionArrays as $index => $countryRegionArray) {
            $displayName = $countryRegionArray['DisplayName'];
            if (isset($nativeNamesByEnglishName[$displayName])) {
                $countryRegionArrays[$index]['NativeName'] = $nativeNamesByEnglishName[$displayName];
            }
        }

        return $countryRegionArrays;
    }

    /**
     * Fills in 'CurrencySymbol' for every country/region whose 'ISOCurrencySymbol' is found on the
     * downloaded page with currency symbols.
     *
     * @param array $countryRegionArrays
     * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
     * @return array The given arrays with 'CurrencySymbol' set where a match was found. Returned as given when the page has not been downloaded.
     */
    private function updateCountryRegionArraysWithCurrencySymbols($countryRegionArrays)
    {
        $pagePath = self::DOWNLOAD_FOLDER.'wiki_currency_symbols.html';
        if (! Storage::exists($pagePath)) {
            return $countryRegionArrays;
        }

        $symbolsByIsoCode = []; //Structure Currency ISO code => Symbol
        $page = new Crawler(Storage::get($pagePath));
        $page->filter('#mw-content-text .wikitable tbody tr')->each(function (Crawler $tableRow) use (&$symbolsByIsoCode) {
            $isoCell = $tableRow->filter('td:nth-child(4)');
            $symbolCell = $tableRow->filter('td:nth-child(3)');
            if ($symbolCell->count() === 0 || $isoCell->count() === 0) {
                return;
            }

            $symbolText = trim($symbolCell->text());
            $isoCode = trim($isoCell->text());
            if ($symbolText === '(none)' || $isoCode === '(none)') {
                return;
            }

            //Only the part in front of the first space is used as the symbol
            $firstToken = explode(' ', $symbolText)[0];
            $symbolsByIsoCode[$isoCode] = preg_replace('/\n/', '', $firstToken);
        });

        foreach ($countryRegionArrays as $index => $countryRegionArray) {
            $isoCode = $countryRegionArray['ISOCurrencySymbol'];
            if (isset($symbolsByIsoCode[$isoCode])) {
                $countryRegionArrays[$index]['CurrencySymbol'] = $symbolsByIsoCode[$isoCode];
            }
        }

        return $countryRegionArrays;
    }

    /**
     * Sets 'IsMetric' for every country/region, based on the downloaded metrication page.
     *
     * A country/region is metric only when its status is 'Complete'. Countries that are not found on the
     * page (directly or through an alternative name) keep the default status 'Complete'.
     *
     * @param array $countryRegionArrays
     * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
     * @return array The given arrays with 'IsMetric' updated. Returned as given when the page has not been downloaded.
     */
    private function updateCountryRegionArraysWithMetricationStatus($countryRegionArrays)
    {
        $pagePath = self::DOWNLOAD_FOLDER.'wiki_metrication.html';
        if (! Storage::exists($pagePath)) {
            return $countryRegionArrays;
        }

        //Checked in this order. The more specific statuses come before the plain 'Complete'.
        $knownStatusses = [
            'Almost entirely complete',
            'Partially complete',
            'Some adoption',
            'Complete',
        ];

        $crawler = new Crawler(Storage::get($pagePath));
        $statusses = []; //Country name => Status
        $crawler->filter('.wikitable > tbody tr')->each(function (Crawler $row) use (&$statusses, $knownStatusses) {
            $countryCell = $row->filter('td:nth-child(2)');
            $statusCell = $row->filter('td:nth-child(4)');
            if ($statusCell->count() === 0 || $countryCell->count() === 0) {
                return;
            }

            $status = $statusCell->text();
            foreach ($knownStatusses as $knownStatus) {
                //strpos() returns 0 for a match at the very start, so compare with false explicitly.
                if (strpos($status, $knownStatus) !== false) {
                    $status = $knownStatus;
                    break;
                }
            }

            //Keep only the leading run of word characters and spaces. Skip the row when there is none.
            if (preg_match('/^[\w ]+/', trim($countryCell->text()), $matches) !== 1) {
                return;
            }
            $country = trim($matches[0]);
            if ($country === '') {
                return;
            }

            $statusses[$country] = $status;
        });

        $alternativeNames = [
            //Value from $countryRegionArrays                //Value from wiki_metrication.html
            'United States'                             =>   'United States of America',
            'United States Minor Outlying Islands'      =>   'United States of America',
        ];

        foreach ($countryRegionArrays as $index => $countryRegionArray) {
            $displayName = $countryRegionArray['DisplayName'];

            $status = 'Complete';
            if (isset($statusses[$displayName])) {
                $status = $statusses[$displayName];
            } elseif (isset($alternativeNames[$displayName], $statusses[$alternativeNames[$displayName]])) {
                $status = $statusses[$alternativeNames[$displayName]];
            }

            $countryRegionArrays[$index]['IsMetric'] = $status === 'Complete';
        }

        return $countryRegionArrays;
    }

    /**
     * Sets 'CurrencyMinorName' and 'numberFormatting' for every country/region whose 'ISOCurrencySymbol'
     * is found on the downloaded currency format page.
     *
     * @param array $countryRegionArrays
     * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
     * @return array The given arrays with the currency info updated where a match was found. Returned as given when the page has not been downloaded.
     */
    private function updateCountryRegionArraysWithFormattingAndMajorMinorCurrencyInfo($countryRegionArrays)
    {
        $pagePath = self::DOWNLOAD_FOLDER.'currency_format.html';
        if (! Storage::exists($pagePath)) {
            return $countryRegionArrays;
        }

        $formatInfoByIsoCode = []; //Structure Currency ISO code => number formatting info plus the 'minor' unit name
        $page = new Crawler(Storage::get($pagePath));
        $page->filter('table.tablestyle tr')->each(function (Crawler $tableRow) use (&$formatInfoByIsoCode) {
            $isoCell = $tableRow->filter('td:nth-child(2)');
            if ($isoCell->count() === 0) {
                return;
            }
            $minorCell = $tableRow->filter('td:nth-child(5)');
            $formatCell = $tableRow->filter('td:nth-child(7)');

            $isoCode = $isoCell->text();

            //Defaults, used for everything the row does not tell us
            $info = [
                'currencyDecimalDigits' => 2,
                'currencyGroupSizes' => 3,
                'currencyGroupSeparator' => ',',
                'currencyDecimalSeparator' => '.',
                'minor' => 'cents',
            ];

            if ($minorCell->count() > 0) {
                $minorName = preg_replace('/\n/', '', $minorCell->text());
                if (str_replace(' ', '', $minorName) !== '') {
                    $info['minor'] = $minorName;
                }
            }

            //Format example "#,###.##"
            if ($formatCell->count() > 0) {
                $segments = explode('#', $formatCell->text());
                //A single segment means the text holds no '#' at all, so there is nothing to derive.
                if (count($segments) > 1) {
                    $decimals = 0;
                    $groupSizes = 0;
                    $decimalSeparator = null;
                    $groupSeparator = null;

                    //Walk from right to left. Empty segments are the gaps between two '#' placeholders;
                    //the first non-empty segment is taken as the decimal separator, the next one as the group separator.
                    //NOTE(review): a format without a decimal part (e.g. "#,###") would have its group separator
                    //taken as the decimal separator - confirm the source page never contains such formats.
                    foreach (array_reverse($segments) as $segment) {
                        $isGap = $segment == '';
                        if ($isGap && ! $decimalSeparator) {
                            $decimals++;
                        } elseif ($isGap && ! $groupSeparator) {
                            $groupSizes++;
                        } elseif (! $isGap && ! $decimalSeparator && ! $groupSeparator) {
                            $decimalSeparator = $segment;
                        } elseif (! $isGap && $decimalSeparator && ! $groupSeparator) {
                            $groupSizes++;
                            $groupSeparator = $segment;
                        }
                    }

                    //The formatting info about euro does not seem to be correct. We manually correct it here. If more info seems to be incorrect, we need another datasource.
                    if ($isoCode === 'EUR') {
                        $groupSeparator = '.';
                        $decimalSeparator = ',';
                    }

                    $info['currencyDecimalDigits'] = $decimals;
                    $info['currencyGroupSizes'] = $groupSizes;
                    $info['currencyGroupSeparator'] = $groupSeparator ?? '';
                    $info['currencyDecimalSeparator'] = $decimalSeparator ?? '';
                }
            }

            $formatInfoByIsoCode[$isoCode] = $info;
        });

        foreach ($countryRegionArrays as $index => $countryRegionArray) {
            $isoCode = $countryRegionArray['ISOCurrencySymbol'];
            if (! isset($formatInfoByIsoCode[$isoCode])) {
                continue;
            }

            //The minor unit name gets its own key and is not part of the number formatting
            $numberFormatting = $formatInfoByIsoCode[$isoCode];
            $countryRegionArrays[$index]['CurrencyMinorName'] = $numberFormatting['minor'];
            unset($numberFormatting['minor']);
            $countryRegionArrays[$index]['numberFormatting'] = $numberFormatting;
        }

        return $countryRegionArrays;
    }

    /**
     * Builds the language arrays from the downloaded page with ISO 639 language codes.
     *
     * Only table rows with exactly one link in their third cell are used. Cells that are missing from such
     * a row result in an empty string instead of an exception.
     *
     * @return array List of arrays with the keys DisplayName, EnglishName, NativeName, ISO-639-1, ISO-639-2 and ISO-639-3.
     * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
     */
    public function transpileLanguageArrays()
    {
        $html = Storage::get(self::DOWNLOAD_FOLDER.'wiki_iso_data.html');
        $crawler = new Crawler();
        $crawler->addHtmlContent($html, 'UTF-8');

        //Crawler::text() throws on an empty node list, so read optional cells through this guard.
        $textOf = static function (Crawler $cell): string {
            return $cell->count() > 0 ? $cell->text() : '';
        };

        $languageRows = [];
        $crawler->filter('#Table > tbody > tr')->each(function (Crawler $row) use (&$languageRows, $textOf) {
            //The same link provides both the display name and the English name
            $nameLink = $row->filter('td:nth-child(3) > a');
            if ($nameLink->count() !== 1) {
                return;
            }
            $name = preg_replace('/\n/u', '', $nameLink->text());

            //The native name cell may hold a comma separated list. Only its first entry is used.
            $nativeNames = $textOf($row->filter('td:nth-child(4)'));
            $nativeName = preg_match('/^.+?(?=,)/u', $nativeNames, $firstNativeName) === 1
                ? $firstNativeName[0]
                : $nativeNames;

            //Only the leading word of the ISO 639-3 cell is used
            $iso3 = preg_match('/^\w+/u', $textOf($row->filter('td:nth-child(8)')), $iso3Match) === 1
                ? $iso3Match[0]
                : '';

            $languageRows[] = [
                'DisplayName' => $name,
                'EnglishName' => $name,
                'NativeName' => mb_convert_case($nativeName, MB_CASE_TITLE, 'UTF-8'),
                'ISO-639-1' => trim($textOf($row->filter('td:nth-child(5)'))),
                'ISO-639-2' => trim($textOf($row->filter('td:nth-child(6)'))),
                'ISO-639-3' => $iso3,
            ];
        });

        return $languageRows;
    }

    /**
     * Lists the files this service downloads.
     *
     * @return array Structure: file name inside the download folder => url to download it from.
     */
    private function filesToDownload()
    {
        return [
            'countryInfo.txt' => 'https://download.geonames.org/export/dump/countryInfo.txt',
            'wiki_iso_data.html' => 'https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes',
            'wiki_countries_native_names.html' => 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_and_their_capitals_in_native_languages',
            'wiki_currency_symbols.html' => 'https://en.wikipedia.org/wiki/List_of_circulating_currencies',
            'wiki_metrication.html' => 'https://en.wikipedia.org/wiki/Metrication',
            'currency_format.html' => 'https://www.thefinancials.com/Default.aspx?SubSectionID=curformat',
            //NOTE(review): no method visible in this class reads this file - confirm it is still needed.
            'countries_by_continent' => 'https://simple.wikipedia.org/wiki/List_of_countries_by_continents',
        ];
    }

    /**
     * Sets the output that status and progress messages are written to.
     *
     * @param OutputStyle $output
     * @return void
     */
    public function setOutputStyle(OutputStyle $output)
    {
        $this->output = $output;
    }

    /**
     * Formats a message that announces a step: log prefix, "(current step/number of steps)", message.
     *
     * Every call increments the step counter.
     *
     * @param $message
     * @return string
     */
    private function formatStepMessage($message)
    {
        $currentStep = ++$this->stepCounter;
        $numberOfSteps = count($this->steps);

        return sprintf('%s (%d/%d) %s', self::LOG_PREFIX, $currentStep, $numberOfSteps, $message);
    }

    /**
     * Formats a regular log message: the log prefix, a space and the message.
     *
     * @param $message
     * @return string
     */
    private function formatMessage($message)
    {
        return sprintf('%s %s', self::LOG_PREFIX, $message);
    }
}