<?php
/**
 * ===========================================
 * FLOWBOT DCI - YAHOO SEARCH ADAPTER
 * ===========================================
 * Free web scraping-based Yahoo Search
 * No API key required - parses HTML directly
 *
 * URL Pattern: https://search.yahoo.com/search?p=QUERY&b=OFFSET
 * Rate limit: 2-3 seconds between requests recommended
 */

declare(strict_types=1);

namespace FlowbotDCI\Services\SearchEngine;

/**
 * Search adapter that scrapes the public Yahoo Search HTML results page.
 *
 * No API key is used: the adapter downloads the results page with cURL and
 * extracts title / URL / snippet triples with XPath. Every failure is reported
 * by returning an empty array from search() and recording a message that can
 * be read through getLastError().
 */
class YahooAdapter implements SearchEngineInterface
{
    public const NAME = 'Yahoo';
    public const VERSION = '1.0';
    public const SEARCH_URL = 'https://search.yahoo.com/search';

    private ?string $lastError = null;
    private int $timeout = 15;
    private int $delayMs = 2000; // 2 second delay between requests

    // User-Agent rotation for anti-bot evasion
    private array $userAgents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/121.0.0.0 Safari/537.36',
    ];

    /**
     * @param array $config Optional settings:
     *                      - 'timeout'  (seconds, passed to CURLOPT_TIMEOUT, default 15)
     *                      - 'delay_ms' (pause after each request in milliseconds, default 2000)
     *                      Numeric strings are accepted; non-numeric or negative
     *                      values fall back to the default.
     */
    public function __construct(array $config = [])
    {
        $this->timeout = $this->intOption($config, 'timeout', 15);
        $this->delayMs = $this->intOption($config, 'delay_ms', 2000);
    }

    public function getName(): string
    {
        return self::NAME;
    }

    public function isAvailable(): bool
    {
        return true; // Always available (no API key required)
    }

    /**
     * Search Yahoo
     *
     * @param string $query Search query
     * @param int $maxResults Maximum results to return
     * @param int $offset Result offset (for pagination); negative values are treated as 0
     * @return array List of ['url', 'title', 'snippet', 'source'] entries; empty on
     *               failure, in which case getLastError() describes the problem
     */
    public function search(string $query, int $maxResults = 10, int $offset = 0): array
    {
        $this->lastError = null;

        // Nothing can be returned, so do not spend a request on it.
        if ($maxResults <= 0) {
            return [];
        }

        if (trim($query) === '') {
            $this->lastError = 'Empty search query';
            return [];
        }

        try {
            $html = $this->fetchResultsPage($query, max(0, $offset));
            if ($html === null) {
                return [];
            }

            return $this->parseResults($html, $maxResults);
        } catch (\Exception $e) {
            $this->lastError = $e->getMessage();
            return [];
        }
    }

    /**
     * Download one results page.
     *
     * @return string|null The response body, or null after recording lastError
     */
    private function fetchResultsPage(string $query, int $offset): ?string
    {
        $url = self::SEARCH_URL . '?' . http_build_query([
            'p' => $query,
            'b' => $offset + 1, // Yahoo uses 1-based offset
        ]);

        $ch = curl_init();
        if ($ch === false) {
            $this->lastError = 'cURL error: unable to initialise a cURL handle';
            return null;
        }

        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => $this->timeout,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            CURLOPT_USERAGENT => $this->userAgents[array_rand($this->userAgents)],
            CURLOPT_HTTPHEADER => [
                'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language: en-US,en;q=0.9,pt-BR;q=0.8,pt;q=0.7',
                'Connection: keep-alive',
                'Upgrade-Insecure-Requests: 1',
                'Sec-Fetch-Dest: document',
                'Sec-Fetch-Mode: navigate',
                'Sec-Fetch-Site: none',
                'Sec-Fetch-User: ?1',
                'Cache-Control: max-age=0',
            ],
            // An empty string makes cURL send an Accept-Encoding header listing
            // only the encodings this build can decode. Hard-coding "br" could
            // yield an undecodable body when cURL is built without brotli.
            CURLOPT_ENCODING => '',
            CURLOPT_SSL_VERIFYPEER => true,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_COOKIEFILE => '', // Enable cookies
            CURLOPT_COOKIEJAR => '',
        ]);

        $response = curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $error = curl_error($ch);
        curl_close($ch);

        // Throttle after every request that was sent, including failed ones,
        // so that callers retrying after an error are rate limited as well.
        $this->throttle();

        if ($response === false || $error !== '') {
            $this->lastError = 'cURL error: ' . ($error !== '' ? $error : 'request failed');
            return null;
        }

        if ($httpCode !== 200) {
            $this->lastError = "HTTP error: {$httpCode}";
            return null;
        }

        if (!is_string($response) || trim($response) === '') {
            $this->lastError = "Empty response from Yahoo";
            return null;
        }

        return $response;
    }

    /**
     * Pause for the configured delay (no-op when the delay is 0).
     */
    private function throttle(): void
    {
        if ($this->delayMs > 0) {
            usleep($this->delayMs * 1000);
        }
    }

    /**
     * Read a non-negative integer option, falling back to $default when the
     * key is missing, not numeric, or negative.
     */
    private function intOption(array $config, string $key, int $default): int
    {
        $value = $config[$key] ?? null;
        if (!is_numeric($value)) {
            return $default;
        }

        $value = (int) $value;

        return $value >= 0 ? $value : $default;
    }

    /**
     * Parse Yahoo search results HTML
     *
     * Tries the container-based selectors first and falls back to the
     * link-based alternative layout when they produce nothing. Sets lastError
     * when no result can be extracted at all.
     */
    private function parseResults(string $html, int $maxResults): array
    {
        // Suppress HTML parsing warnings, remembering the caller's setting.
        $previousSetting = libxml_use_internal_errors(true);

        try {
            $dom = new \DOMDocument();
            $loaded = $dom->loadHTML(
                '<?xml encoding="UTF-8">' . $html,
                LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
            );
            if ($loaded === false) {
                $this->lastError = 'Unable to parse HTML response from Yahoo';
                return [];
            }

            $xpath = new \DOMXPath($dom);

            $results = $this->parseStructuredLayout($xpath, $maxResults);
            if (empty($results)) {
                // Try alternative parsing for newer Yahoo layout
                $results = $this->parseAlternativeLayout($xpath, $maxResults);
            }

            if (empty($results)) {
                $this->lastError = "No results found in HTML (selectors not matching)";
            }

            return $results;
        } finally {
            // Runs on every exit path so the libxml error buffer is emptied
            // and the global error-handling flag is restored.
            libxml_clear_errors();
            libxml_use_internal_errors($previousSetting);
        }
    }

    /**
     * Extract results from result-container elements.
     *
     * Selectors are tried in order; the first one that yields at least one
     * usable result wins. Results with the same normalised URL are emitted once.
     */
    private function parseStructuredLayout(\DOMXPath $xpath, int $maxResults): array
    {
        // Yahoo result containers - try multiple selectors
        $selectors = [
            '//div[contains(@class, "algo")]',
            '//li[contains(@class, "algo")]',
            '//div[contains(@class, "dd")]//div[contains(@class, "compTitle")]',
            '//div[@id="web"]//div[contains(@class, "Sr")]',
        ];

        foreach ($selectors as $selector) {
            $nodes = $xpath->query($selector);
            if ($nodes === false || $nodes->length === 0) {
                continue;
            }

            $results = [];
            $seenUrls = [];

            foreach ($nodes as $node) {
                if (count($results) >= $maxResults) {
                    break;
                }

                $result = $this->parseResultNode($xpath, $node);
                if ($result === null) {
                    continue;
                }

                // Nested containers can match the same selector and describe
                // the same link, so skip URLs that were already emitted.
                $key = $this->dedupeKey($result['url']);
                if (isset($seenUrls[$key])) {
                    continue;
                }
                $seenUrls[$key] = true;

                $results[] = $result;
            }

            if (!empty($results)) {
                return $results;
            }
        }

        return [];
    }

    /**
     * Parse a single result node
     *
     * @return array|null The result entry, or null when no title or no usable
     *                    http(s) URL could be found, or the URL is a Yahoo
     *                    search link
     */
    private function parseResultNode(\DOMXPath $xpath, \DOMNode $node): ?array
    {
        // Title selectors
        $titleSelectors = [
            './/h3//a',
            './/h4//a',
            './/a[contains(@class, "ac-algo")]',
            './/a[contains(@class, "fz-l")]',
            './/a[@href]//span',
        ];

        $title = null;
        $rawUrl = null;

        foreach ($titleSelectors as $selector) {
            $titleNodes = $xpath->query($selector, $node);
            if ($titleNodes === false || $titleNodes->length === 0) {
                continue;
            }

            $titleNode = $titleNodes->item(0);
            $candidateTitle = trim($titleNode->textContent);
            if ($candidateTitle === '') {
                continue;
            }

            // Keep the first non-empty title in case no selector gives a link.
            if ($title === null) {
                $title = $candidateTitle;
            }

            // Get URL from the link (the node itself or an enclosing element)
            $href = $this->findHref($titleNode, $node);
            if ($href !== null) {
                $title = $candidateTitle;
                $rawUrl = $href;
                break;
            }
        }

        // URL fallback selectors
        if ($rawUrl === null) {
            $urlSelectors = [
                './/span[contains(@class, "compUrl")]//cite',
                './/cite',
                './/span[contains(@class, "fz-ms")]',
                './/a[contains(@class, "ac-algo")]/@href',
            ];

            foreach ($urlSelectors as $selector) {
                $urlNodes = $xpath->query($selector, $node);
                if ($urlNodes === false || $urlNodes->length === 0) {
                    continue;
                }

                $text = trim($urlNodes->item(0)->textContent);
                if ($text !== '') {
                    $rawUrl = $text;
                    break;
                }
            }
        }

        // Extract actual URL from Yahoo redirect; when that fails, try to
        // repair the raw value (display URLs are often written without scheme).
        $url = $this->extractRealUrl($rawUrl);
        if ($url === null && $rawUrl !== null) {
            $url = $this->repairSchemelessUrl($rawUrl);
        }

        // Validate result
        if ($title === null || $url === null) {
            return null;
        }

        // Skip Yahoo internal links
        if ($this->isYahooSearchLink($url)) {
            return null;
        }

        // Snippet selectors
        $snippetSelectors = [
            './/p[contains(@class, "compText")]',
            './/div[contains(@class, "compText")]',
            './/span[contains(@class, "fc-falcon")]',
            './/p',
        ];

        $snippet = '';
        foreach ($snippetSelectors as $selector) {
            $snippetNodes = $xpath->query($selector, $node);
            if ($snippetNodes === false || $snippetNodes->length === 0) {
                continue;
            }

            $text = trim($snippetNodes->item(0)->textContent);
            if ($text !== '') {
                $snippet = $text;
                break;
            }
        }

        return [
            'url' => $url,
            'title' => html_entity_decode($title, ENT_QUOTES, 'UTF-8'),
            'snippet' => html_entity_decode($snippet, ENT_QUOTES, 'UTF-8'),
            'source' => self::NAME,
        ];
    }

    /**
     * Parse alternative Yahoo layout (newer design)
     *
     * Collects anchors whose href embeds an "RU=http" redirect target, or,
     * when there are none, every anchor with an href inside the element with
     * id "web". Links pointing at a yahoo.com host are skipped.
     */
    private function parseAlternativeLayout(\DOMXPath $xpath, int $maxResults): array
    {
        $results = [];

        // Try to find all links that look like search results
        $links = $xpath->query('//a[contains(@href, "RU=http")]');

        if ($links === false || $links->length === 0) {
            // Try another pattern
            $links = $xpath->query('//div[@id="web"]//a[@href]');
        }

        if ($links === false) {
            return [];
        }

        $seenUrls = [];

        foreach ($links as $link) {
            if (count($results) >= $maxResults) {
                break;
            }

            if (!$link instanceof \DOMElement) {
                continue;
            }

            $title = trim($link->textContent);
            if (strlen($title) < 5) {
                continue;
            }

            // Extract real URL
            $url = $this->extractRealUrl($link->getAttribute('href'));
            if ($url === null) {
                continue;
            }

            // Skip duplicates
            $key = $this->dedupeKey($url);
            if (isset($seenUrls[$key])) {
                continue;
            }
            $seenUrls[$key] = true;

            // Skip Yahoo internal links
            if ($this->hostBelongsTo($url, 'yahoo.com')) {
                continue;
            }

            $results[] = [
                'url' => $url,
                'title' => html_entity_decode($title, ENT_QUOTES, 'UTF-8'),
                'snippet' => '',
                'source' => self::NAME,
            ];
        }

        return $results;
    }

    /**
     * Extract real URL from Yahoo redirect URL
     *
     * @return string|null The redirect target when one is embedded and valid,
     *                     otherwise the input itself when it is a valid
     *                     http(s) URL, otherwise null
     */
    private function extractRealUrl(?string $url): ?string
    {
        if ($url === null) {
            return null;
        }

        $url = trim($url);
        if ($url === '') {
            return null;
        }

        // Yahoo uses redirect URLs like:
        // https://r.search.yahoo.com/_ylt=xxx;_ylu=xxx/RU=https%3A%2F%2Fexample.com%2F/RK=xxx/RS=xxx
        if (preg_match('/RU=([^\/]+)/', $url, $matches) === 1) {
            // rawurldecode keeps a literal "+" intact; urldecode would turn it
            // into a space and corrupt the target URL.
            $decoded = rawurldecode($matches[1]);
            if ($this->isHttpUrl($decoded)) {
                return $decoded;
            }
        }

        // Check if it's already a valid URL
        return $this->isHttpUrl($url) ? $url : null;
    }

    public function getLastError(): ?string
    {
        return $this->lastError;
    }

    /**
     * Return the first non-empty href found on $start or one of its ancestors,
     * without looking above $boundary (the result container).
     */
    private function findHref(\DOMNode $start, \DOMNode $boundary): ?string
    {
        $current = $start;

        while ($current instanceof \DOMElement) {
            $href = trim($current->getAttribute('href'));
            if ($href !== '') {
                return $href;
            }

            if ($current->isSameNode($boundary)) {
                break;
            }

            $current = $current->parentNode;
        }

        return null;
    }

    /**
     * Turn a protocol-relative ("//host/path") or scheme-less
     * ("www.example.com/path") value into an https URL.
     *
     * @return string|null The repaired URL, or null when the value does not
     *                     start with something that looks like a host name or
     *                     the repaired value still fails validation
     */
    private function repairSchemelessUrl(string $raw): ?string
    {
        $candidate = trim($raw);

        if (strpos($candidate, '//') === 0) {
            $candidate = 'https:' . $candidate;
        } elseif (preg_match('/^[a-z0-9][a-z0-9.-]*\.[a-z]{2,}(?:[\/:?#]|$)/i', $candidate) === 1) {
            $candidate = 'https://' . $candidate;
        } else {
            // Relative paths, fragments and the like are not result URLs.
            return null;
        }

        return $this->isHttpUrl($candidate) ? $candidate : null;
    }

    /**
     * True when $url passes FILTER_VALIDATE_URL and uses the http or https scheme.
     */
    private function isHttpUrl(string $url): bool
    {
        if (filter_var($url, FILTER_VALIDATE_URL) === false) {
            return false;
        }

        $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));

        return $scheme === 'http' || $scheme === 'https';
    }

    /**
     * True when the host of $url is $domain or one of its subdomains.
     */
    private function hostBelongsTo(string $url, string $domain): bool
    {
        $host = strtolower((string) parse_url($url, PHP_URL_HOST));
        if ($host === '') {
            return false;
        }

        $suffix = '.' . $domain;

        return $host === $domain || substr($host, -strlen($suffix)) === $suffix;
    }

    /**
     * True for links that lead back into Yahoo search: anything hosted on
     * search.yahoo.com (including its subdomains) and any other yahoo.com
     * page whose path contains "/search". Other yahoo.com pages are allowed.
     */
    private function isYahooSearchLink(string $url): bool
    {
        if (!$this->hostBelongsTo($url, 'yahoo.com')) {
            return false;
        }

        if ($this->hostBelongsTo($url, 'search.yahoo.com')) {
            return true;
        }

        $path = (string) parse_url($url, PHP_URL_PATH);

        return strpos($path, '/search') !== false;
    }

    /**
     * Key used to detect duplicate results: lower-cased URL without trailing slashes.
     */
    private function dedupeKey(string $url): string
    {
        return strtolower(rtrim($url, '/'));
    }
}
