<?php
/**
 * ===========================================
 * FLOWBOT DCI - DUCKDUCKGO SEARCH ADAPTER
 * ===========================================
 * Free search engine adapter using DuckDuckGo Lite/HTML
 * No API key required - v2.0 with improved resilience
 *
 * Features:
 * - User-Agent rotation
 * - Retry with exponential backoff
 * - Lite and HTML fallback
 * - Rate limiting awareness
 */

declare(strict_types=1);

namespace FlowbotDCI\Services\SearchEngine;

/**
 * Search engine adapter that scrapes DuckDuckGo's Lite and HTML front-ends.
 *
 * No API key is needed. Each search attempt tries the Lite endpoint first and
 * falls back to the HTML endpoint; when the failure looks like a block or rate
 * limit the adapter backs off exponentially before the next attempt.
 */
class DuckDuckGoAdapter implements SearchEngineInterface
{
    public const NAME = 'DuckDuckGo';
    public const VERSION = '2.0';
    public const LITE_URL = 'https://lite.duckduckgo.com/lite/';
    public const HTML_URL = 'https://html.duckduckgo.com/html/';

    /** Message describing the most recent failure, or null when there is none. */
    private ?string $lastError = null;

    /** cURL total timeout in seconds (0 lets cURL wait indefinitely). */
    private int $timeout = 15;

    /** Pause applied after a successful page parse, in milliseconds. */
    private int $delayMs = 2000;

    /** Number of retries after the first attempt. */
    private int $retryCount = 2;

    /** @var list<string> User-Agent strings; one is picked at random per request. */
    private array $userAgents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/121.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0',
    ];

    /**
     * @param array{timeout?: int|string, delay_ms?: int|string, retry_count?: int|string} $config
     *        Optional overrides. Values are cast to int (so numeric strings are
     *        accepted under strict_types) and negative values are clamped to 0.
     */
    public function __construct(array $config = [])
    {
        $this->timeout = max(0, (int) ($config['timeout'] ?? 15));
        $this->delayMs = max(0, (int) ($config['delay_ms'] ?? 2000));
        $this->retryCount = max(0, (int) ($config['retry_count'] ?? 2));
    }

    public function getName(): string
    {
        return self::NAME;
    }

    public function isAvailable(): bool
    {
        return true; // Always available (no API key required)
    }

    /**
     * Run a search, retrying with exponential backoff when blocked.
     *
     * Only the first response page is fetched; $offset is honoured by skipping
     * that many parsed results, so an offset beyond what the page contains
     * yields an empty list.
     *
     * @param string $query      Search terms; a blank query returns [] without a request.
     * @param int    $maxResults Maximum number of results to return; <= 0 returns [].
     * @param int    $offset     Number of leading results to skip (negative treated as 0).
     *
     * @return list<array{url: string, title: string, snippet: string, source: string}>
     */
    public function search(string $query, int $maxResults = 10, int $offset = 0): array
    {
        $this->lastError = null;

        if ($maxResults <= 0) {
            return [];
        }

        if (trim($query) === '') {
            $this->lastError = 'Empty search query';
            return [];
        }

        $offset = max(0, $offset);
        $wanted = $offset + $maxResults;

        for ($attempt = 0; $attempt <= $this->retryCount; $attempt++) {
            // Lite first (simpler markup), HTML endpoint as the fallback.
            $results = $this->searchLite($query, $wanted);
            if ($results === []) {
                $results = $this->searchHtml($query, $wanted);
            }

            if ($results !== []) {
                // A failed Lite request must not linger as the "last error" of a successful search.
                $this->lastError = null;
                return array_slice($results, $offset, $maxResults);
            }

            // Back off (3s, 6s, 12s, ...) only when another attempt will follow;
            // sleeping after the final attempt would just delay the empty return.
            if ($attempt < $this->retryCount && $this->lastErrorIndicatesBlock()) {
                sleep(3 * (2 ** $attempt));
            }
        }

        return [];
    }

    /**
     * Whether the recorded error looks like a block or rate limit.
     */
    private function lastErrorIndicatesBlock(): bool
    {
        if ($this->lastError === null) {
            return false;
        }

        foreach (['403', '429', 'blocked'] as $marker) {
            if (str_contains($this->lastError, $marker)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Search using DuckDuckGo Lite (GET request, table-based markup).
     *
     * @return list<array{url: string, title: string, snippet: string, source: string}>
     */
    private function searchLite(string $query, int $maxResults): array
    {
        try {
            $html = $this->fetchPage([
                CURLOPT_URL => self::LITE_URL . '?q=' . urlencode($query),
                CURLOPT_HTTPHEADER => [
                    'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Language: en-US,en;q=0.9',
                    'Connection: keep-alive',
                    'Upgrade-Insecure-Requests: 1',
                    'Sec-Fetch-Dest: document',
                    'Sec-Fetch-Mode: navigate',
                    'Sec-Fetch-Site: none',
                    'Cache-Control: max-age=0',
                ],
            ]);

            if ($html === null) {
                return [];
            }

            $results = $this->parseLiteResults($html, $maxResults);
            $this->pauseAfterSuccess($results);

            return $results;
        } catch (\Exception $e) {
            // Best effort: record the failure and let the HTML fallback try.
            $this->lastError = $e->getMessage();
            return [];
        }
    }

    /**
     * Search using the DuckDuckGo HTML endpoint (form POST), used as fallback.
     *
     * @return list<array{url: string, title: string, snippet: string, source: string}>
     */
    private function searchHtml(string $query, int $maxResults): array
    {
        try {
            $html = $this->fetchPage([
                CURLOPT_URL => self::HTML_URL,
                CURLOPT_POST => true,
                CURLOPT_POSTFIELDS => http_build_query(['q' => $query]),
                CURLOPT_HTTPHEADER => [
                    'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Language: en-US,en;q=0.9',
                    'Content-Type: application/x-www-form-urlencoded',
                    'Origin: https://html.duckduckgo.com',
                    'Referer: https://html.duckduckgo.com/',
                    'Connection: keep-alive',
                    'Upgrade-Insecure-Requests: 1',
                ],
            ]);

            if ($html === null) {
                return [];
            }

            $results = $this->parseHtmlResults($html, $maxResults);
            $this->pauseAfterSuccess($results);

            return $results;
        } catch (\Exception $e) {
            $this->lastError = $e->getMessage();
            return [];
        }
    }

    /**
     * Perform one HTTP request and return the body of a usable 200 response.
     *
     * On any failure (transport error, 403/429, non-200 or empty body, block
     * page) the reason is stored in $lastError and null is returned.
     *
     * @param array<int, mixed> $requestOptions cURL options specific to the request
     *        (URL, method, headers); they take precedence over the shared defaults.
     */
    private function fetchPage(array $requestOptions): ?string
    {
        $ch = curl_init();
        if ($ch === false) {
            $this->lastError = 'cURL error: unable to initialise handle';
            return null;
        }

        $sharedOptions = [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => $this->timeout,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            CURLOPT_USERAGENT => $this->userAgents[array_rand($this->userAgents)],
            CURLOPT_SSL_VERIFYPEER => true,
            CURLOPT_SSL_VERIFYHOST => 2,
            // Empty string = advertise exactly the encodings this libcurl build
            // can decode. Hard-coding "br" (or sending a manual Accept-Encoding
            // header) breaks on builds without brotli support.
            CURLOPT_ENCODING => '',
            CURLOPT_COOKIEFILE => '',
            CURLOPT_COOKIEJAR => '',
        ];

        // Array union keeps the integer option keys; request options win on conflict.
        curl_setopt_array($ch, $requestOptions + $sharedOptions);

        $body = curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $error = curl_error($ch);
        // No curl_close(): it has no effect on PHP 8; the handle is released
        // when $ch goes out of scope.

        if ($error !== '') {
            $this->lastError = "cURL error: {$error}";
            return null;
        }

        if ($httpCode === 403 || $httpCode === 429) {
            $this->lastError = "HTTP {$httpCode}: Rate limited or blocked";
            return null;
        }

        if ($httpCode !== 200 || !is_string($body) || $body === '') {
            $this->lastError = "HTTP {$httpCode}: Empty response";
            return null;
        }

        if ($this->isBlocked($body)) {
            $this->lastError = "Request blocked by DuckDuckGo";
            return null;
        }

        return $body;
    }

    /**
     * Sleep for the configured delay after a page that produced results.
     *
     * @param array<int, mixed> $results Parsed results of the request just made.
     */
    private function pauseAfterSuccess(array $results): void
    {
        if ($this->delayMs > 0 && $results !== []) {
            usleep($this->delayMs * 1000);
        }
    }

    /**
     * Check if the response indicates we're blocked.
     *
     * A page counts as a block page only when it both mentions one of the
     * block keywords and contains captcha / human-verification markup, so that
     * ordinary results which merely mention those words are not misclassified.
     */
    private function isBlocked(string $html): bool
    {
        $blockIndicators = [
            'robot',
            'captcha',
            'blocked',
            'unusual traffic',
            'automated',
            'too many requests',
            'rate limit',
        ];

        $htmlLower = strtolower($html);
        $mentionsBlock = false;
        foreach ($blockIndicators as $indicator) {
            if (str_contains($htmlLower, $indicator)) {
                $mentionsBlock = true;
                break;
            }
        }

        if (!$mentionsBlock) {
            return false;
        }

        // [^"]* keeps the match inside a single class attribute; a greedy .*
        // would match any later "captcha" text on the same line.
        if (preg_match('/<form[^>]*captcha|id="captcha"|class="[^"]*captcha/i', $html) === 1) {
            return true;
        }

        return preg_match('/please verify|prove you.*human|not a robot/i', $html) === 1;
    }

    /**
     * Parse HTML into a DOM and return an XPath evaluator for it.
     *
     * libxml parse errors are collected internally and discarded; the previous
     * libxml error-handling mode is restored afterwards.
     */
    private function createXPath(string $html): \DOMXPath
    {
        $previous = libxml_use_internal_errors(true);

        $dom = new \DOMDocument();
        // The XML prolog makes libxml treat the markup as UTF-8.
        $dom->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_NOERROR | LIBXML_NOWARNING);

        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        return new \DOMXPath($dom);
    }

    /**
     * Parse DuckDuckGo Lite results (at most one result per table row).
     *
     * @return list<array{url: string, title: string, snippet: string, source: string}>
     */
    private function parseLiteResults(string $html, int $maxResults): array
    {
        $results = [];
        $seenUrls = [];

        $xpath = $this->createXPath($html);
        $rows = $xpath->query("//table//tr");
        if ($rows === false) {
            return [];
        }

        foreach ($rows as $row) {
            if (count($results) >= $maxResults) {
                break;
            }

            $links = $xpath->query(".//a[@href]", $row);
            if ($links === false) {
                continue;
            }

            foreach ($links as $link) {
                if (!$link instanceof \DOMElement) {
                    continue;
                }

                $href = $link->getAttribute('href');
                $text = trim($link->textContent);

                if ($href === '' || strlen($text) < 5 || str_starts_with($href, 'javascript:')) {
                    continue;
                }

                // Skip DuckDuckGo's own navigation links, but keep its redirect
                // links (…/l/?uddg=<target>), which wrap the real result URL.
                if (str_contains($href, 'duckduckgo.com') && !str_contains($href, 'uddg=')) {
                    continue;
                }

                $actualUrl = $this->extractActualUrl($href);
                if ($actualUrl === null || !filter_var($actualUrl, FILTER_VALIDATE_URL)) {
                    continue;
                }

                // Skip duplicates (case-insensitive, ignoring a trailing slash).
                $normalizedUrl = strtolower(rtrim($actualUrl, '/'));
                if (isset($seenUrls[$normalizedUrl])) {
                    continue;
                }
                $seenUrls[$normalizedUrl] = true;

                $results[] = [
                    'url' => $actualUrl,
                    'title' => html_entity_decode($text, ENT_QUOTES, 'UTF-8'),
                    'snippet' => '',
                    'source' => self::NAME,
                ];
                break; // one result per row
            }
        }

        return $results;
    }

    /**
     * Parse DuckDuckGo HTML results.
     *
     * Several selectors are tried in order; results from all of them are
     * merged (de-duplicated by URL) until $maxResults is reached.
     *
     * @return list<array{url: string, title: string, snippet: string, source: string}>
     */
    private function parseHtmlResults(string $html, int $maxResults): array
    {
        $results = [];
        $seenUrls = [];

        $xpath = $this->createXPath($html);

        $selectors = [
            "//a[@class='result__a']",
            "//a[contains(@class, 'result__url')]",
            "//div[contains(@class, 'result')]//h2//a[@href]",
            "//div[contains(@class, 'result')]//a[@class='result__a']",
            "//div[contains(@class, 'links_main')]//a[@href]",
            "//h2//a[@href]",
        ];

        foreach ($selectors as $selector) {
            $links = $xpath->query($selector);
            if ($links === false || $links->length === 0) {
                continue;
            }

            foreach ($links as $link) {
                if (count($results) >= $maxResults) {
                    break 2;
                }

                if (!$link instanceof \DOMElement) {
                    continue;
                }

                $href = $link->getAttribute('href');
                $title = trim($link->textContent);

                if ($href === '' || strlen($title) < 3) {
                    continue;
                }

                $actualUrl = $this->extractActualUrl($href);
                if ($actualUrl === null || !filter_var($actualUrl, FILTER_VALIDATE_URL)) {
                    continue;
                }

                // Skip duplicates (case-insensitive, ignoring a trailing slash).
                $normalizedUrl = strtolower(rtrim($actualUrl, '/'));
                if (isset($seenUrls[$normalizedUrl])) {
                    continue;
                }
                $seenUrls[$normalizedUrl] = true;

                $results[] = [
                    'url' => $actualUrl,
                    'title' => html_entity_decode($title, ENT_QUOTES, 'UTF-8'),
                    'snippet' => html_entity_decode($this->findSnippet($xpath, $link), ENT_QUOTES, 'UTF-8'),
                    'source' => self::NAME,
                ];
            }
        }

        return $results;
    }

    /**
     * Find the snippet text belonging to a result link.
     *
     * Looks for the first a.result__snippet inside the link's nearest
     * enclosing <div> (or the whole document when there is none).
     */
    private function findSnippet(\DOMXPath $xpath, \DOMNode $link): string
    {
        $container = $link->parentNode;
        while ($container !== null && $container->nodeName !== 'div') {
            $container = $container->parentNode;
        }

        if ($container === null) {
            return '';
        }

        $snippetNodes = $xpath->query(".//a[@class='result__snippet']", $container);
        if ($snippetNodes === false || $snippetNodes->length === 0) {
            return '';
        }

        return trim($snippetNodes->item(0)->textContent);
    }

    /**
     * Extract actual URL from DuckDuckGo redirect URL.
     *
     * Handles //duckduckgo.com/l/?uddg=ENCODED_URL redirects, the alternative
     * "u" parameter, direct absolute URLs and protocol-relative URLs. Links
     * pointing at duckduckgo.com itself are rejected.
     *
     * @return string|null The target URL, or null when none can be derived.
     */
    private function extractActualUrl(string $href): ?string
    {
        $query = parse_url($href, PHP_URL_QUERY);
        if (is_string($query) && $query !== '') {
            parse_str($query, $params);

            foreach (['uddg', 'u'] as $name) {
                // parse_str() has already URL-decoded the value; decoding it a
                // second time would corrupt targets containing "%" or "+".
                // Array values (e.g. "uddg[]=") are ignored.
                $target = $params[$name] ?? null;
                if (is_string($target) && filter_var($target, FILTER_VALIDATE_URL)) {
                    return $target;
                }
            }
        }

        // Direct URL
        if (filter_var($href, FILTER_VALIDATE_URL)) {
            // Skip DDG internal links
            return str_contains($href, 'duckduckgo.com') ? null : $href;
        }

        // Protocol-relative
        if (str_starts_with($href, '//')) {
            $fullUrl = 'https:' . $href;
            return str_contains($fullUrl, 'duckduckgo.com') ? null : $fullUrl;
        }

        return null;
    }

    /**
     * Message describing the most recent failure, or null when the last
     * search succeeded or no failure was recorded.
     */
    public function getLastError(): ?string
    {
        return $this->lastError;
    }
}
