<?php
/**
 * ===========================================
 * FLOWBOT DCI - BING SCRAPER ADAPTER
 * ===========================================
 * Free web scraping-based Bing Search
 * No API key required - parses HTML directly
 *
 * URL Pattern: https://www.bing.com/search?q=QUERY&first=OFFSET
 * Rate limit: 3-5 seconds between requests recommended
 */

declare(strict_types=1);

namespace FlowbotDCI\Services\SearchEngine;

/**
 * Search engine adapter that scrapes Bing's public HTML result pages.
 *
 * Results are fetched with cURL and extracted with DOM/XPath. Failures never
 * throw to the caller: the public methods return an empty array and the reason
 * is available through getLastError().
 */
class BingScraperAdapter implements SearchEngineInterface
{
    public const NAME = 'Bing';
    public const VERSION = '1.0';
    public const SEARCH_URL = 'https://www.bing.com/search';

    /** Message describing the most recent failure, or null when the last attempt had none. */
    private ?string $lastError = null;

    /** cURL transfer timeout in seconds. */
    private int $timeout = 15;

    /** Pause applied after each parsed response, in milliseconds. */
    private int $delayMs = 3000;

    /** Number of additional attempts made after the first one returns nothing. */
    private int $retryCount = 2;

    // User-Agent rotation for anti-bot evasion
    private array $userAgents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/121.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    ];

    /**
     * @param array $config Optional overrides: 'timeout' (seconds),
     *                      'delay_ms' (milliseconds), 'retry_count'.
     *                      Missing, non-numeric or negative values fall back
     *                      to the defaults (15, 3000, 2).
     */
    public function __construct(array $config = [])
    {
        $this->timeout = self::intOption($config, 'timeout', 15);
        $this->delayMs = self::intOption($config, 'delay_ms', 3000);
        $this->retryCount = self::intOption($config, 'retry_count', 2);
    }

    /**
     * Read a non-negative integer option from the config array.
     *
     * Numeric strings are accepted so that values read from env/ini files do
     * not raise a TypeError under strict_types when assigned to int properties.
     */
    private static function intOption(array $config, string $key, int $default): int
    {
        $value = $config[$key] ?? null;
        if (!is_numeric($value)) {
            return $default;
        }

        $value = (int) $value;

        return $value >= 0 ? $value : $default;
    }

    public function getName(): string
    {
        return self::NAME;
    }

    public function isAvailable(): bool
    {
        return true; // Always available (no API key required)
    }

    /**
     * Search Bing, retrying when an attempt yields no results.
     *
     * Up to 1 + retry_count attempts are made. When the failure looks like a
     * block (CAPTCHA, HTTP 429 or HTTP 403) the adapter backs off for
     * 5s, 10s, 20s, ... before the next attempt.
     *
     * @param string $query Search query
     * @param int $maxResults Maximum results to return
     * @param int $offset Result offset (for pagination); negative values are treated as 0
     * @return array List of ['url', 'title', 'snippet', 'source'] entries, empty on failure
     */
    public function search(string $query, int $maxResults = 10, int $offset = 0): array
    {
        $this->lastError = null;

        // Nothing useful can come back for these inputs, so skip the network round-trips.
        if (trim($query) === '' || $maxResults < 1) {
            $this->lastError = 'Invalid search request: empty query or non-positive result limit';
            return [];
        }

        $offset = max(0, $offset);

        for ($attempt = 0; $attempt <= $this->retryCount; $attempt++) {
            // Reset per attempt so a successful retry does not report a stale error.
            $this->lastError = null;

            $results = $this->doSearch($query, $maxResults, $offset);
            if (!empty($results)) {
                return $results;
            }

            // Back off only when another attempt will actually follow.
            if ($attempt < $this->retryCount && $this->isBlockedError($this->lastError)) {
                sleep((2 ** $attempt) * 5);
            }
        }

        return [];
    }

    /**
     * Tell whether an error message indicates Bing blocked or throttled the request.
     */
    private function isBlockedError(?string $error): bool
    {
        if ($error === null || $error === '') {
            return false;
        }

        foreach (['CAPTCHA', '429', '403'] as $marker) {
            if (strpos($error, $marker) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Perform a single search request and parse the response.
     *
     * On any failure the reason is stored in $lastError and an empty array is
     * returned. The configured delay is applied after a response was parsed.
     */
    private function doSearch(string $query, int $maxResults, int $offset): array
    {
        try {
            $url = self::SEARCH_URL . '?' . http_build_query([
                'q' => $query,
                'first' => $offset + 1, // Bing uses 1-based offset
                'count' => min($maxResults, 50), // Max 50 per page
            ]);

            // Random User-Agent
            $userAgent = $this->userAgents[array_rand($this->userAgents)];

            $ch = curl_init();
            if ($ch === false) {
                $this->lastError = 'cURL error: unable to initialise a cURL handle';
                return [];
            }

            // NOTE(review): the Sec-Ch-Ua* client hints below always describe
            // Chrome 121 on Windows, even when the rotated User-Agent is
            // Firefox, Safari or a non-Windows platform.
            curl_setopt_array($ch, [
                CURLOPT_URL => $url,
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_TIMEOUT => $this->timeout,
                CURLOPT_FOLLOWLOCATION => true,
                CURLOPT_MAXREDIRS => 5,
                CURLOPT_USERAGENT => $userAgent,
                CURLOPT_HTTPHEADER => [
                    'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
                    'Accept-Language: en-US,en;q=0.9',
                    'Connection: keep-alive',
                    'Upgrade-Insecure-Requests: 1',
                    'Sec-Fetch-Dest: document',
                    'Sec-Fetch-Mode: navigate',
                    'Sec-Fetch-Site: none',
                    'Sec-Fetch-User: ?1',
                    'Sec-Ch-Ua: "Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
                    'Sec-Ch-Ua-Mobile: ?0',
                    'Sec-Ch-Ua-Platform: "Windows"',
                    'Cache-Control: max-age=0',
                ],
                // Empty string = advertise every encoding this libcurl build
                // can decode. Hard-coding "br" (here or as a manual
                // Accept-Encoding header) yields an undecodable body when
                // libcurl was built without brotli support.
                CURLOPT_ENCODING => '',
                CURLOPT_SSL_VERIFYPEER => true,
                CURLOPT_SSL_VERIFYHOST => 2,
                CURLOPT_COOKIEFILE => '', // Enable cookies
                CURLOPT_COOKIEJAR => '',
            ]);

            $response = curl_exec($ch);
            $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
            $error = curl_error($ch);
            curl_close($ch);

            if ($error !== '' || $response === false) {
                $this->lastError = "cURL error: {$error}";
                return [];
            }

            if ($httpCode === 429) {
                $this->lastError = "HTTP 429: Rate limited by Bing";
                return [];
            }

            if ($httpCode === 403) {
                $this->lastError = "HTTP 403: Blocked by Bing (possible CAPTCHA)";
                return [];
            }

            if ($httpCode !== 200) {
                $this->lastError = "HTTP error: {$httpCode}";
                return [];
            }

            if (empty($response)) {
                $this->lastError = "Empty response from Bing";
                return [];
            }

            if ($this->detectCaptcha($response)) {
                $this->lastError = "CAPTCHA detected - request blocked";
                return [];
            }

            $results = $this->parseResults($response, $maxResults);

            // Rate limiting delay
            if ($this->delayMs > 0) {
                usleep($this->delayMs * 1000);
            }

            return $results;
        } catch (\Exception $e) {
            $this->lastError = $e->getMessage();
            return [];
        }
    }

    /**
     * Detect if Bing is showing a CAPTCHA / bot-check page.
     *
     * The check is a case-insensitive substring search for known markers.
     */
    private function detectCaptcha(string $html): bool
    {
        $captchaIndicators = [
            'id="captcha"',
            'class="captcha"',
            'recaptcha',
            'Please verify you are a human',
            'unusual traffic',
            'automated queries',
            'bot detection',
        ];

        foreach ($captchaIndicators as $indicator) {
            if (stripos($html, $indicator) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Parse Bing search results HTML into result entries.
     *
     * Tries the known result-container selectors first and falls back to
     * parseAlternativeLayout() when none of them match. Sets $lastError when
     * nothing usable could be extracted.
     */
    private function parseResults(string $html, int $maxResults): array
    {
        // Collect libxml warnings internally instead of emitting PHP warnings.
        // The previous setting is restored in the finally block because this
        // switch is process-wide and would otherwise leak into unrelated code.
        $previousLibxmlSetting = libxml_use_internal_errors(true);

        try {
            $dom = new \DOMDocument();
            // The XML prolog forces libxml to treat the markup as UTF-8.
            $loaded = $dom->loadHTML(
                '<?xml encoding="UTF-8">' . $html,
                LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
            );
            if ($loaded === false) {
                $this->lastError = "No results found in HTML (selectors not matching)";
                return [];
            }

            $xpath = new \DOMXPath($dom);

            // Bing result containers - multiple selectors for different layouts.
            // (A selector scoped to ol#b_results is unnecessary: every node it
            // could match is already matched by the first selector.)
            $selectors = [
                '//li[contains(@class, "b_algo")]',
                '//div[contains(@class, "b_algo")]',
            ];

            $nodes = null;
            foreach ($selectors as $selector) {
                $candidates = $xpath->query($selector);
                if ($candidates !== false && $candidates->length > 0) {
                    $nodes = $candidates;
                    break;
                }
            }

            if ($nodes === null) {
                $results = $this->parseAlternativeLayout($xpath, $maxResults);
                if (empty($results)) {
                    $this->lastError = "No results found in HTML (selectors not matching)";
                }
                return $results;
            }

            $results = [];
            foreach ($nodes as $node) {
                if (count($results) >= $maxResults) {
                    break;
                }

                $result = $this->parseResultNode($xpath, $node);
                if ($result !== null) {
                    $results[] = $result;
                }
            }

            if (empty($results)) {
                $this->lastError = "No usable results extracted from matched result containers";
            }

            return $results;
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousLibxmlSetting);
        }
    }

    /**
     * Parse a single result container into a result entry.
     *
     * @return array|null ['url', 'title', 'snippet', 'source'], or null when the
     *                    node has no title, no valid external http(s) URL, or
     *                    points back to Bing itself (internal links, ads).
     */
    private function parseResultNode(\DOMXPath $xpath, \DOMNode $node): ?array
    {
        $titleSelectors = [
            './/h2//a',
            './/h3//a',
            './/a[contains(@class, "tilk")]',
            './/a[@href]',
        ];

        $title = '';
        $url = '';

        foreach ($titleSelectors as $selector) {
            $anchors = $xpath->query($selector, $node);
            if ($anchors === false || $anchors->length === 0) {
                continue;
            }

            $anchor = $anchors->item(0);
            $anchorTitle = trim($anchor->textContent);
            $anchorUrl = $anchor instanceof \DOMElement ? $anchor->getAttribute('href') : '';

            // Take title and URL from the same anchor so they cannot be mismatched.
            if ($anchorTitle !== '' && $anchorUrl !== '') {
                $title = $anchorTitle;
                $url = $anchorUrl;
                break;
            }

            // Keep the first title seen; the <cite> fallback may still supply a URL.
            if ($title === '' && $anchorTitle !== '') {
                $title = $anchorTitle;
            }
        }

        // URL fallback - look for the cite element showing the display URL
        if ($url === '' || !filter_var($url, FILTER_VALIDATE_URL)) {
            $urlSelectors = [
                './/cite',
                './/div[contains(@class, "b_attribution")]//cite',
            ];

            foreach ($urlSelectors as $selector) {
                $cites = $xpath->query($selector, $node);
                if ($cites === false || $cites->length === 0) {
                    continue;
                }

                $citeUrl = trim($cites->item(0)->textContent);
                // Prefix a scheme unless one is really present; a plain "http"
                // prefix test would wrongly skip hosts such as "httpbin.org".
                if ($citeUrl !== '' && !preg_match('#^https?://#i', $citeUrl)) {
                    $citeUrl = 'https://' . $citeUrl;
                }
                if (filter_var($citeUrl, FILTER_VALIDATE_URL)) {
                    $url = $citeUrl;
                    break;
                }
            }
        }

        $snippetSelectors = [
            './/p',
            './/div[contains(@class, "b_caption")]//p',
            './/div[contains(@class, "b_caption")]',
        ];

        $snippet = '';
        foreach ($snippetSelectors as $selector) {
            $snippetNodes = $xpath->query($selector, $node);
            if ($snippetNodes === false || $snippetNodes->length === 0) {
                continue;
            }

            $snippet = trim($snippetNodes->item(0)->textContent);
            // Accept the first candidate that is long enough to be a real snippet.
            if (strlen($snippet) > 20) {
                break;
            }
        }

        if ($title === '' || !$this->isExternalHttpUrl($url)) {
            return null;
        }

        // textContent is already entity-decoded by the DOM parser; decoding a
        // second time would corrupt text that legitimately contains "&...;".
        return [
            'url' => $url,
            'title' => $title,
            'snippet' => $snippet,
            'source' => self::NAME,
        ];
    }

    /**
     * Parse alternative Bing layout by collecting heading links directly.
     *
     * Used when no known result container is present. Snippets are not
     * available in this mode and are returned as empty strings.
     */
    private function parseAlternativeLayout(\DOMXPath $xpath, int $maxResults): array
    {
        $links = $xpath->query('//ol[@id="b_results"]//h2//a[@href]');

        if ($links === false || $links->length === 0) {
            $links = $xpath->query('//div[@id="b_content"]//h2//a[@href]');
        }

        if ($links === false) {
            return [];
        }

        $results = [];
        $seenUrls = [];

        foreach ($links as $link) {
            if (count($results) >= $maxResults) {
                break;
            }

            $url = $link->getAttribute('href');
            $title = trim($link->textContent);

            if (strlen($title) < 3) {
                continue;
            }

            if (!$this->isExternalHttpUrl($url)) {
                continue;
            }

            // Skip duplicates (ignoring case and a trailing slash)
            $normalizedUrl = strtolower(rtrim($url, '/'));
            if (isset($seenUrls[$normalizedUrl])) {
                continue;
            }
            $seenUrls[$normalizedUrl] = true;

            $results[] = [
                'url' => $url,
                'title' => $title,
                'snippet' => '',
                'source' => self::NAME,
            ];
        }

        return $results;
    }

    /**
     * Check that a URL is a valid http(s) URL that does not point back to Bing.
     *
     * The Bing check is done on the host (bing.com and its subdomains, which
     * also covers /aclick ad redirects) rather than as a substring of the whole
     * URL, so external pages that merely mention "bing.com" in their path or
     * query are kept. URLs containing "microsoft.com/bing" are rejected too.
     */
    private function isExternalHttpUrl(string $url): bool
    {
        if ($url === '' || !filter_var($url, FILTER_VALIDATE_URL)) {
            return false;
        }

        $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
        if ($scheme !== 'http' && $scheme !== 'https') {
            return false;
        }

        $host = strtolower((string) parse_url($url, PHP_URL_HOST));
        if ($host === '') {
            return false;
        }

        if ($host === 'bing.com' || substr($host, -9) === '.bing.com') {
            return false;
        }

        return strpos($url, 'microsoft.com/bing') === false;
    }

    /**
     * Return the message recorded for the most recent failure, or null when the
     * last attempt recorded none.
     */
    public function getLastError(): ?string
    {
        return $this->lastError;
    }
}
