<?php
/**
 * ===========================================
 * FLOWBOT DCI - SITEMAP FETCHER v1.0
 * ===========================================
 * Fetches and parses XML sitemaps
 *
 * Features:
 * - Parse standard sitemap.xml
 * - Handle sitemap indexes (sitemaps of sitemaps)
 * - Extract URLs with metadata (lastmod, priority, changefreq)
 * - Support for gzip compressed sitemaps
 */

declare(strict_types=1);

namespace FlowbotDCI\Services;

/**
 * Downloads XML sitemaps over HTTP(S) and flattens them into a list of URL entries.
 *
 * Both plain `<urlset>` documents and `<sitemapindex>` documents are handled;
 * index documents are followed recursively up to `maxDepth` levels. Failures are
 * reported by returning an empty result and recording a message that can be read
 * with getLastError().
 */
class SitemapFetcher
{
    public const VERSION = '1.0';

    /** XML namespace used by the sitemaps.org 0.9 schema. */
    private const SITEMAP_NS = 'http://www.sitemaps.org/schemas/sitemap/0.9';

    /** Message describing the most recent failure, or null when none was recorded. */
    private ?string $lastError = null;

    /** Transfer timeout in seconds for sitemap downloads. */
    private int $timeout = 30;

    /** Upper bound on the number of URL entries returned by a single fetch. */
    private int $maxUrls = 50000;

    /** Maximum nesting depth followed through sitemap indexes. */
    private int $maxDepth = 2;

    /**
     * Sitemap URLs already requested during the current fetch() call.
     *
     * @var array<string, true>
     */
    private array $visited = [];

    /**
     * @param array $config Optional overrides: 'timeout' (seconds), 'maxUrls', 'maxDepth'.
     *                      Values are cast to int so numeric strings are accepted
     *                      despite strict_types.
     */
    public function __construct(array $config = [])
    {
        $this->timeout = (int) ($config['timeout'] ?? 30);
        $this->maxUrls = (int) ($config['maxUrls'] ?? 50000);
        $this->maxDepth = (int) ($config['maxDepth'] ?? 2);
    }

    /**
     * Fetch and parse a sitemap.
     *
     * @param string $url Sitemap URL (urlset or sitemap index)
     * @return array List of entries, each with keys 'url', 'lastmod', 'changefreq'
     *               and 'priority' (metadata values are strings or null). Empty on
     *               failure; see getLastError().
     */
    public function fetch(string $url): array
    {
        $this->lastError = null;
        $this->visited = [];

        return $this->fetchRecursive($url, 0);
    }

    /**
     * Fetch one sitemap document and dispatch on its type.
     *
     * Returns an empty list when the depth limit is exceeded, when the URL was
     * already requested during this fetch() call (guards against indexes that
     * reference themselves or each other), or when download/parsing fails.
     */
    private function fetchRecursive(string $url, int $depth): array
    {
        if ($depth > $this->maxDepth) {
            return [];
        }

        if (isset($this->visited[$url])) {
            return [];
        }
        $this->visited[$url] = true;

        $xml = $this->fetchUrl($url);
        if ($xml === null) {
            return [];
        }

        // Cheap substring sniffing decides which parser to use.
        if (strpos($xml, '<sitemapindex') !== false) {
            return $this->parseSitemapIndex($xml, $depth);
        }

        if (strpos($xml, '<urlset') !== false) {
            return $this->parseUrlset($xml);
        }

        $this->lastError = "Unknown sitemap format";
        return [];
    }

    /**
     * Download a sitemap document and return its (decompressed) body.
     *
     * @return string|null Body text, or null on any failure (lastError is set)
     */
    private function fetchUrl(string $url): ?string
    {
        [$body, $httpCode, $error] = $this->httpGet(
            $url,
            $this->timeout,
            'FlowbotDCI/1.0 Sitemap Fetcher',
            ['Accept: application/xml, text/xml, */*']
        );

        if ($httpCode !== 200) {
            $this->lastError = "HTTP {$httpCode}: " . ($error ?: 'Failed to fetch sitemap');
            return null;
        }

        if (empty($body)) {
            $this->lastError = "Empty response from {$url}";
            return null;
        }

        // Files such as sitemap.xml.gz are usually served as raw gzip data with no
        // Content-Encoding header, so cURL does not inflate them. Detect the gzip
        // magic bytes and decompress here.
        if (strncmp($body, "\x1f\x8b", 2) === 0) {
            $decoded = $this->gunzip($body);
            if ($decoded === null) {
                $this->lastError = "Failed to decompress gzip sitemap from {$url}";
                return null;
            }
            $body = $decoded;
        }

        return $body;
    }

    /**
     * Perform a GET request restricted to HTTP/HTTPS, following up to 3 redirects.
     *
     * The protocol restriction matters because sitemap and robots.txt contents are
     * untrusted and may point at file://, gopher:// or other schemes.
     *
     * @param string[] $headers Extra request headers
     * @return array Tuple of [body (string|false), HTTP status (int), cURL error (string)]
     */
    private function httpGet(string $url, int $timeout, string $userAgent, array $headers = []): array
    {
        $ch = curl_init();
        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => $timeout,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 3,
            CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_USERAGENT => $userAgent,
            CURLOPT_HTTPHEADER => $headers,
            // Empty string: advertise and transparently decode every content
            // encoding this cURL build supports.
            CURLOPT_ENCODING => '',
        ]);

        $body = curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $error = curl_error($ch);
        curl_close($ch);

        return [$body, $httpCode, $error];
    }

    /**
     * Inflate a gzip-encoded string.
     *
     * @return string|null Decompressed data, or null when zlib is unavailable or
     *                     the data is corrupt
     */
    private function gunzip(string $data): ?string
    {
        if (!function_exists('gzdecode')) {
            return null;
        }

        // gzdecode() raises a warning on corrupt input; swallow it locally so the
        // failure is reported through the return value only.
        set_error_handler(static function (): bool {
            return true;
        });
        try {
            $decoded = gzdecode($data);
        } finally {
            restore_error_handler();
        }

        return $decoded === false ? null : $decoded;
    }

    /**
     * Parse an XML string with network access disabled.
     *
     * libxml's internal-error flag is restored to its previous value afterwards so
     * that parsing a sitemap does not alter process-wide libxml behaviour.
     *
     * @return \SimpleXMLElement|null Parsed document, or null when the XML is malformed
     */
    private function loadXml(string $xml): ?\SimpleXMLElement
    {
        $previous = libxml_use_internal_errors(true);
        try {
            $doc = simplexml_load_string($xml, \SimpleXMLElement::class, LIBXML_NONET);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        return $doc === false ? null : $doc;
    }

    /**
     * Parse a sitemap index and collect the entries of every child sitemap.
     *
     * Stops requesting further children once `maxUrls` entries were collected.
     *
     * @return array List of URL entries, at most `maxUrls` long
     */
    private function parseSitemapIndex(string $xml, int $depth): array
    {
        $doc = $this->loadXml($xml);
        if ($doc === null) {
            $this->lastError = "Failed to parse sitemap index XML";
            return [];
        }

        $doc->registerXPathNamespace('sm', self::SITEMAP_NS);

        // Prefer namespaced lookups, then fall back to documents without a namespace.
        $locNodes = $doc->xpath('//sm:sitemap/sm:loc');
        if (empty($locNodes)) {
            $locNodes = $doc->xpath('//sitemap/loc');
        }
        if (empty($locNodes)) {
            $locNodes = [];
            foreach ($doc->sitemap as $sitemap) {
                $locNodes[] = $sitemap->loc;
            }
        }

        $urls = [];
        foreach ($locNodes as $locNode) {
            // Pretty-printed sitemaps often pad <loc> with whitespace/newlines.
            $loc = trim((string) $locNode);
            if ($loc === '') {
                continue;
            }

            // Append in place; array_merge() here would re-copy the whole list
            // for every child sitemap.
            foreach ($this->fetchRecursive($loc, $depth + 1) as $entry) {
                $urls[] = $entry;
            }

            if (count($urls) >= $this->maxUrls) {
                break;
            }
        }

        return array_slice($urls, 0, $this->maxUrls);
    }

    /**
     * Parse a urlset document into URL entries.
     *
     * Entries whose <loc> is missing or not a valid URL are skipped. Collection
     * stops once `maxUrls` valid entries were gathered.
     *
     * @return array List of URL entries
     */
    private function parseUrlset(string $xml): array
    {
        $doc = $this->loadXml($xml);
        if ($doc === null) {
            $this->lastError = "Failed to parse sitemap XML";
            return [];
        }

        $doc->registerXPathNamespace('sm', self::SITEMAP_NS);

        // Namespaced lookup first; direct children as a fallback for documents
        // that omit the sitemap namespace.
        $urlNodes = $doc->xpath('//sm:url');
        if (empty($urlNodes)) {
            $urlNodes = $doc->url;
        }

        $urls = [];
        foreach ($urlNodes as $urlNode) {
            $entry = $this->parseUrlNode($urlNode);
            if ($entry === null) {
                continue;
            }

            $urls[] = $entry;
            if (count($urls) >= $this->maxUrls) {
                break;
            }
        }

        return $urls;
    }

    /**
     * Convert a single <url> element into an entry array.
     *
     * @return array|null Entry with 'url', 'lastmod', 'changefreq', 'priority',
     *                    or null when <loc> is empty or not a valid URL
     */
    private function parseUrlNode(\SimpleXMLElement $node): ?array
    {
        $loc = $this->childText($node, 'loc');

        if ($loc === null || !filter_var($loc, FILTER_VALIDATE_URL)) {
            return null;
        }

        return [
            'url' => $loc,
            'lastmod' => $this->childText($node, 'lastmod'),
            'changefreq' => $this->childText($node, 'changefreq'),
            'priority' => $this->childText($node, 'priority'),
        ];
    }

    /**
     * Trimmed text of a child element, or null when it is absent or blank.
     *
     * Compares against the empty string explicitly so that a literal "0"
     * (e.g. <priority>0</priority>) is preserved rather than treated as missing.
     */
    private function childText(\SimpleXMLElement $node, string $name): ?string
    {
        $text = trim((string) $node->{$name});

        return $text === '' ? null : $text;
    }

    /**
     * Fetch just the URLs (without metadata).
     *
     * @return string[] List of URL strings
     */
    public function fetchUrls(string $url): array
    {
        return array_column($this->fetch($url), 'url');
    }

    /**
     * Find a sitemap URL by reading a site's robots.txt.
     *
     * @param string $domain Host name or base URL; "https://" is assumed when no
     *                       scheme is given
     * @return string|null The first `Sitemap:` directive found; the conventional
     *                     `/sitemap.xml` location when robots.txt has no such
     *                     directive; null when robots.txt could not be retrieved
     */
    public function findSitemapFromRobots(string $domain): ?string
    {
        $domain = trim($domain);
        if (!preg_match('#^https?://#', $domain)) {
            $domain = 'https://' . $domain;
        }

        $base = rtrim($domain, '/');

        [$body, $httpCode] = $this->httpGet($base . '/robots.txt', 10, 'FlowbotDCI/1.0');

        if ($httpCode !== 200 || empty($body)) {
            return null;
        }

        // Directive names are case-insensitive and may be indented; trim() also
        // removes the trailing "\r" left by CRLF line endings.
        if (preg_match('/^[ \t]*Sitemap:\s*(.+)$/mi', $body, $matches)) {
            return trim($matches[1]);
        }

        // No directive present: fall back to the conventional location.
        return $base . '/sitemap.xml';
    }

    /**
     * Get the message recorded by the most recent failure.
     *
     * @return string|null Error message, or null if no failure was recorded since
     *                     the last fetch() call
     */
    public function getLastError(): ?string
    {
        return $this->lastError;
    }
}
