<?php
/**
 * ============================================
 * FLOWBOT DCI - SITEMAP PARSER v1.0
 * ============================================
 * Parses XML sitemaps and sitemap indexes.
 *
 * Features:
 * - Standard sitemap parsing
 * - Sitemap index support
 * - Gzip compressed sitemaps
 * - URL extraction with metadata
 * - Priority and lastmod extraction
 * - Recursive index parsing
 * ============================================
 */

declare(strict_types=1);

namespace FlowbotDCI\Services\Crawler;

/**
 * Parses XML sitemaps (urlset documents) and sitemap indexes.
 *
 * A parse run starts from one sitemap URL, follows sitemap-index entries
 * recursively and collects page URLs (optionally with metadata) until either
 * the sitemap limit or the URL limit is reached. Fetch and XML errors are
 * treated as "no data" rather than raised, so a broken child sitemap never
 * aborts the whole run.
 */
class SitemapParser
{
    public const VERSION = '1.0';

    /**
     * Upper bound, in bytes, for a downloaded or decompressed sitemap.
     * The sitemap protocol caps a single sitemap at 50 MiB uncompressed;
     * enforcing it protects against oversized responses and gzip bombs.
     */
    private const MAX_SITEMAP_BYTES = 52428800;

    /**
     * Priority reported when an entry has no usable <priority> value.
     */
    private const DEFAULT_PRIORITY = 0.5;

    /**
     * Maximum sitemaps to process (prevent infinite loops)
     */
    private int $maxSitemaps = 100;

    /**
     * Maximum URLs to extract
     */
    private int $maxUrls = 50000;

    /**
     * User agent for requests
     */
    private string $userAgent = 'FlowbotDCI/1.0';

    /**
     * Request timeout in seconds
     */
    private int $timeout = 30;

    /**
     * Whether TLS certificates are verified on HTTPS requests.
     *
     * NOTE(security): defaults to false only to preserve the historical
     * behaviour of this class; callers should enable it via setVerifySsl(true)
     * unless they knowingly crawl hosts with broken certificates.
     */
    private bool $verifySsl = false;

    /**
     * Extracted URLs (strings, or metadata arrays when parsing with metadata)
     */
    private array $urls = [];

    /**
     * Processed sitemap URLs, stored as keys for constant-time lookup
     */
    private array $processedSitemaps = [];

    /**
     * Set maximum sitemaps to process (values below 1 are raised to 1)
     */
    public function setMaxSitemaps(int $max): self
    {
        $this->maxSitemaps = max(1, $max);
        return $this;
    }

    /**
     * Set maximum URLs to extract (values below 1 are raised to 1)
     */
    public function setMaxUrls(int $max): self
    {
        $this->maxUrls = max(1, $max);
        return $this;
    }

    /**
     * Set user agent
     */
    public function setUserAgent(string $userAgent): self
    {
        $this->userAgent = $userAgent;
        return $this;
    }

    /**
     * Set request timeout in seconds (values below 1 are raised to 1)
     */
    public function setTimeout(int $seconds): self
    {
        $this->timeout = max(1, $seconds);
        return $this;
    }

    /**
     * Enable or disable TLS peer verification for HTTPS requests
     */
    public function setVerifySsl(bool $verify): self
    {
        $this->verifySsl = $verify;
        return $this;
    }

    /**
     * Parse sitemap and return all URLs
     *
     * @return string[] Page URLs in document order
     */
    public function parse(string $sitemapUrl): array
    {
        $this->reset();
        $this->processSitemap($sitemapUrl);

        return $this->urls;
    }

    /**
     * Parse sitemap and return URLs with metadata
     *
     * @return array[] One array per URL with the keys url, lastmod,
     *                 changefreq, priority, images, videos and news
     */
    public function parseWithMetadata(string $sitemapUrl): array
    {
        $this->reset();
        $this->processSitemap($sitemapUrl, true);

        return $this->urls;
    }

    /**
     * Process a single sitemap (either an index or a urlset)
     */
    private function processSitemap(string $url, bool $withMetadata = false): void
    {
        // Stop when a limit is reached or this sitemap was already visited.
        if (
            count($this->processedSitemaps) >= $this->maxSitemaps
            || count($this->urls) >= $this->maxUrls
            || isset($this->processedSitemaps[$url])
        ) {
            return;
        }

        // Mark before fetching so that failing sitemaps are not retried.
        $this->processedSitemaps[$url] = true;

        // Leading whitespace before the XML declaration is a fatal XML error,
        // so surrounding whitespace is removed before parsing.
        $content = trim($this->fetchSitemap($url));
        if ($content === '') {
            return;
        }

        $xml = $this->loadXml($content);
        if ($xml === null) {
            return;
        }

        // The root element name decides how the document is interpreted.
        switch ($xml->getName()) {
            case 'sitemapindex':
                $this->processSitemapIndex($xml, $withMetadata);
                break;
            case 'urlset':
                $this->processUrlset($xml, $withMetadata);
                break;
        }
    }

    /**
     * Parse an XML string, returning null when it is not well-formed.
     *
     * libxml's global "internal errors" switch is restored afterwards so the
     * parser does not leak configuration into the rest of the application.
     */
    private function loadXml(string $content): ?\SimpleXMLElement
    {
        $previous = libxml_use_internal_errors(true);

        try {
            // LIBXML_NONET forbids network access while loading the document.
            $xml = simplexml_load_string($content, \SimpleXMLElement::class, LIBXML_NONET);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        return $xml instanceof \SimpleXMLElement ? $xml : null;
    }

    /**
     * Process sitemap index
     */
    private function processSitemapIndex(\SimpleXMLElement $xml, bool $withMetadata): void
    {
        foreach ($xml->sitemap as $sitemap) {
            $loc = trim((string) $sitemap->loc);

            // Child locations come from remote, untrusted XML: only plain
            // http(s) URLs are followed, never file://, phar:// and the like.
            if ($this->isHttpUrl($loc)) {
                $this->processSitemap($loc, $withMetadata);
            }

            // Check limit
            if (count($this->urls) >= $this->maxUrls) {
                return;
            }
        }
    }

    /**
     * Process urlset (regular sitemap)
     */
    private function processUrlset(\SimpleXMLElement $xml, bool $withMetadata): void
    {
        // Prefix => URI map of every namespace declared in the document.
        $namespaces = $xml->getNamespaces(true);

        foreach ($xml->url as $urlElement) {
            // Pretty-printed sitemaps often wrap the URL in whitespace.
            $loc = trim((string) $urlElement->loc);

            if ($loc === '' || filter_var($loc, FILTER_VALIDATE_URL) === false) {
                continue;
            }

            if ($withMetadata) {
                $this->urls[] = [
                    'url' => $loc,
                    'lastmod' => $this->parseLastmod($urlElement),
                    'changefreq' => (string) ($urlElement->changefreq ?? ''),
                    'priority' => $this->parsePriority($urlElement),
                    'images' => $this->extractImages($urlElement, $namespaces),
                    'videos' => $this->extractVideos($urlElement, $namespaces),
                    'news' => $this->extractNews($urlElement, $namespaces),
                ];
            } else {
                $this->urls[] = $loc;
            }

            // Check limit
            if (count($this->urls) >= $this->maxUrls) {
                return;
            }
        }
    }

    /**
     * Tell whether a string is a syntactically valid http or https URL
     */
    private function isHttpUrl(string $url): bool
    {
        if ($url === '' || filter_var($url, FILTER_VALIDATE_URL) === false) {
            return false;
        }

        $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));

        return $scheme === 'http' || $scheme === 'https';
    }

    /**
     * Fetch sitemap content
     *
     * @return string The (decompressed) document, or '' when the fetch fails,
     *                the payload exceeds MAX_SITEMAP_BYTES, or gzip decoding
     *                fails
     */
    private function fetchSitemap(string $url): string
    {
        $context = stream_context_create([
            'http' => [
                'timeout' => $this->timeout,
                'user_agent' => $this->userAgent,
                'follow_location' => true,
                'max_redirects' => 5,
            ],
            'ssl' => [
                'verify_peer' => $this->verifySsl,
                'verify_peer_name' => $this->verifySsl,
            ],
        ]);

        // Read one byte past the cap so oversized payloads can be detected.
        // Warnings are suppressed on purpose: a failed fetch means "no data".
        $content = @file_get_contents($url, false, $context, 0, self::MAX_SITEMAP_BYTES + 1);

        if ($content === false || strlen($content) > self::MAX_SITEMAP_BYTES) {
            return '';
        }

        // Decide by the gzip magic bytes only: some servers deliver *.gz URLs
        // already decompressed, and forcing gzdecode() on plain XML would
        // discard an otherwise valid sitemap.
        if ($this->isGzipped($content)) {
            // The length cap makes gzdecode() fail instead of inflating a
            // decompression bomb.
            $decoded = @gzdecode($content, self::MAX_SITEMAP_BYTES);

            return $decoded === false ? '' : $decoded;
        }

        return $content;
    }

    /**
     * Check if content starts with the gzip magic bytes
     */
    private function isGzipped(string $content): bool
    {
        return strlen($content) >= 2 && substr($content, 0, 2) === "\x1f\x8b";
    }

    /**
     * Parse lastmod date
     *
     * @return string|null Date formatted as 'Y-m-d H:i:s' in the default
     *                     timezone, or null when missing or unparseable
     */
    private function parseLastmod(\SimpleXMLElement $element): ?string
    {
        $lastmod = trim((string) ($element->lastmod ?? ''));

        if ($lastmod === '') {
            return null;
        }

        $timestamp = strtotime($lastmod);

        return $timestamp !== false ? date('Y-m-d H:i:s', $timestamp) : null;
    }

    /**
     * Parse priority value
     *
     * @return float Priority clamped to [0.0, 1.0]; the default priority when
     *               the element is missing or not numeric
     */
    private function parsePriority(\SimpleXMLElement $element): float
    {
        $priority = trim((string) ($element->priority ?? ''));

        // An explicit "0" is a legitimate priority, so test for the empty
        // string instead of relying on empty().
        if ($priority === '' || !is_numeric($priority)) {
            return self::DEFAULT_PRIORITY;
        }

        return max(0.0, min(1.0, (float) $priority));
    }

    /**
     * Run a relative "prefix:name" XPath query below a context node.
     *
     * @return \SimpleXMLElement[] Matching nodes; empty when nothing matches
     *                             or the query fails
     */
    private function xpathNodes(\SimpleXMLElement $context, string $prefix, string $namespaceUri, string $name): array
    {
        // The prefix has to be registered on every node that is queried.
        $context->registerXPathNamespace($prefix, $namespaceUri);
        $nodes = $context->xpath("{$prefix}:{$name}");

        // xpath() may return false or null instead of an array on failure.
        return is_array($nodes) ? $nodes : [];
    }

    /**
     * Return the text of the first "prefix:name" child, or null when absent
     */
    private function xpathText(\SimpleXMLElement $context, string $prefix, string $namespaceUri, string $name): ?string
    {
        $nodes = $this->xpathNodes($context, $prefix, $namespaceUri, $name);

        return $nodes !== [] ? (string) $nodes[0] : null;
    }

    /**
     * Extract images from sitemap entry
     *
     * @return array[] One array per image with the keys url, title, caption
     */
    private function extractImages(\SimpleXMLElement $element, array $namespaces): array
    {
        // Prefer the URI the document binds to "image"; otherwise use the
        // standard image-sitemap namespace.
        $imageNs = $namespaces['image'] ?? 'http://www.google.com/schemas/sitemap-image/1.1';
        $images = [];

        foreach ($this->xpathNodes($element, 'image', $imageNs, 'image') as $img) {
            $loc = $this->xpathText($img, 'image', $imageNs, 'loc');

            // An image without a location is useless to consumers.
            if ($loc === null) {
                continue;
            }

            $images[] = [
                'url' => $loc,
                'title' => $this->xpathText($img, 'image', $imageNs, 'title'),
                'caption' => $this->xpathText($img, 'image', $imageNs, 'caption'),
            ];
        }

        return $images;
    }

    /**
     * Extract videos from sitemap entry
     *
     * @return array[] One array per video with the keys url, title,
     *                 description, thumbnail, duration
     */
    private function extractVideos(\SimpleXMLElement $element, array $namespaces): array
    {
        $videoNs = $namespaces['video'] ?? 'http://www.google.com/schemas/sitemap-video/1.1';
        $videos = [];

        foreach ($this->xpathNodes($element, 'video', $videoNs, 'video') as $video) {
            $contentLoc = $this->xpathText($video, 'video', $videoNs, 'content_loc');
            $title = $this->xpathText($video, 'video', $videoNs, 'title');

            // Keep the entry only if it has at least a location or a title.
            if ($contentLoc === null && $title === null) {
                continue;
            }

            $duration = $this->xpathText($video, 'video', $videoNs, 'duration');

            $videos[] = [
                'url' => $contentLoc,
                'title' => $title,
                'description' => $this->xpathText($video, 'video', $videoNs, 'description'),
                'thumbnail' => $this->xpathText($video, 'video', $videoNs, 'thumbnail_loc'),
                'duration' => $duration !== null ? (int) $duration : null,
            ];
        }

        return $videos;
    }

    /**
     * Extract news data from sitemap entry
     *
     * @return array|null News fields of the first news:news child, or null
     *                    when the entry carries no news data
     */
    private function extractNews(\SimpleXMLElement $element, array $namespaces): ?array
    {
        $newsNs = $namespaces['news'] ?? 'http://www.google.com/schemas/sitemap-news/0.9';

        $newsElements = $this->xpathNodes($element, 'news', $newsNs, 'news');
        if ($newsElements === []) {
            return null;
        }

        // Only the first news block of an entry is reported.
        $news = $newsElements[0];

        $pubName = null;
        $pubLanguage = null;

        $publication = $this->xpathNodes($news, 'news', $newsNs, 'publication');
        if ($publication !== []) {
            $pubName = $this->xpathText($publication[0], 'news', $newsNs, 'name');
            $pubLanguage = $this->xpathText($publication[0], 'news', $newsNs, 'language');
        }

        return [
            'publication_name' => $pubName,
            'publication_language' => $pubLanguage,
            'title' => $this->xpathText($news, 'news', $newsNs, 'title'),
            'publication_date' => $this->xpathText($news, 'news', $newsNs, 'publication_date'),
            'keywords' => $this->xpathText($news, 'news', $newsNs, 'keywords'),
        ];
    }

    /**
     * Get common sitemap locations for a domain
     *
     * @param string $domain Base URL including the scheme; trailing slashes
     *                       are ignored
     * @return string[] Candidate sitemap URLs
     */
    public static function getCommonSitemapUrls(string $domain): array
    {
        $baseUrl = rtrim($domain, '/');

        return [
            "{$baseUrl}/sitemap.xml",
            "{$baseUrl}/sitemap_index.xml",
            "{$baseUrl}/sitemap-index.xml",
            "{$baseUrl}/sitemaps/sitemap.xml",
            "{$baseUrl}/sitemap/sitemap.xml",
            "{$baseUrl}/sitemap.xml.gz",
            "{$baseUrl}/sitemap_index.xml.gz",
            "{$baseUrl}/post-sitemap.xml",
            "{$baseUrl}/page-sitemap.xml",
            "{$baseUrl}/category-sitemap.xml",
            "{$baseUrl}/news-sitemap.xml",
            "{$baseUrl}/video-sitemap.xml",
            "{$baseUrl}/image-sitemap.xml",
        ];
    }

    /**
     * Discover sitemaps for a domain
     *
     * Sends a HEAD request to each common sitemap location and keeps the
     * locations whose final response (after redirects) has status 200.
     *
     * @return string[] The candidate URLs that answered with 200
     */
    public function discoverSitemaps(string $domain): array
    {
        // The context does not depend on the probed URL, so build it once.
        $context = stream_context_create([
            'http' => [
                'method' => 'HEAD',
                'timeout' => 5,
                'user_agent' => $this->userAgent,
            ],
            'ssl' => [
                'verify_peer' => $this->verifySsl,
                'verify_peer_name' => $this->verifySsl,
            ],
        ]);

        $found = [];

        foreach (self::getCommonSitemapUrls($domain) as $url) {
            if ($this->probeStatus($url, $context) === 200) {
                $found[] = $url;
            }
        }

        return $found;
    }

    /**
     * Return the final HTTP status code for a URL, or null when unreachable.
     *
     * The stream API is used instead of get_headers(): the second parameter
     * of get_headers() is an int in PHP 7 and a bool in PHP 8, so under
     * strict_types no single literal is accepted by both versions.
     *
     * @param resource $context Stream context carrying the request options
     */
    private function probeStatus(string $url, $context): ?int
    {
        // fopen() fails (warning suppressed) on network errors and on
        // 4xx/5xx responses; both simply mean "not available".
        $stream = @fopen($url, 'r', false, $context);
        if ($stream === false) {
            return null;
        }

        $meta = stream_get_meta_data($stream);
        fclose($stream);

        // wrapper_data lists the header lines of every response in the
        // redirect chain; the last status line belongs to the final response.
        $status = null;
        foreach ((array) ($meta['wrapper_data'] ?? []) as $line) {
            if (is_string($line) && preg_match('#^HTTP/\S+\s+(\d{3})#', $line, $match) === 1) {
                $status = (int) $match[1];
            }
        }

        return $status;
    }

    /**
     * Get statistics about the last parse operation
     *
     * @return array Counts of collected URLs and visited sitemaps, plus the
     *               configured limits
     */
    public function getStats(): array
    {
        return [
            'total_urls' => count($this->urls),
            'sitemaps_processed' => count($this->processedSitemaps),
            'max_urls' => $this->maxUrls,
            'max_sitemaps' => $this->maxSitemaps,
        ];
    }

    /**
     * Reset parser state (collected URLs and visited sitemaps)
     */
    public function reset(): void
    {
        $this->urls = [];
        $this->processedSitemaps = [];
    }
}
