<?php
/**
 * ============================================
 * FLOWBOT DCI - UNIFIED CRAWLER v7.0
 * ============================================
 * Main orchestrator for unified crawling operations.
 * Combines search engines, deep crawling, sitemap parsing,
 * and infinite mode into a single professional interface.
 *
 * Features:
 * - Multi-engine search (Bing, Yahoo, DuckDuckGo, SearXNG)
 * - Deep BFS crawling with depth control
 * - Sitemap parsing and processing
 * - Infinite/continuous mode
 * - Hybrid mode (search + deep crawl)
 * - 4-phase adaptive processing
 * - Real-time SSE events
 * - Checkpoint/resume capability
 * ============================================
 */

declare(strict_types=1);

namespace FlowbotDCI\Services\Crawler;

use FlowbotDCI\Core\Database;
use FlowbotDCI\Services\WebScraper;
use FlowbotDCI\Services\UrlProcessor;
use FlowbotDCI\Services\ProgressTracker;
use FlowbotDCI\Services\DomainRateLimiter;
use FlowbotDCI\Services\CircuitBreaker;
use FlowbotDCI\Services\SearchEngine\SearchEngineAggregator;
use Generator;
use PDO;

/**
 * Orchestrates a crawl job: seeds a URL queue (from search results, seed URLs
 * or a sitemap), fetches the queue in batches, filters pages by relevance and
 * duplication, imports them into `pinfeeds` and follows links up to the
 * configured depth.
 *
 * The public start*() methods are generators: nothing happens until the caller
 * iterates them, and every yielded value is a progress snapshot (see
 * getProgress()). pause(), resume() and stop() are meant to be called by the
 * consumer between two yields.
 *
 * NOTE(review): $database, $urlProcessor, $progressTracker, $rateLimiter and
 * $circuitBreaker, as well as the 'rate_limit_delay_ms' option, are stored but
 * never consulted anywhere in this class.
 */
class UnifiedCrawler
{
    public const VERSION = '7.0';

    // Crawl modes
    public const MODE_DEEP = 'deep';
    public const MODE_SEARCH = 'search';
    public const MODE_SITEMAP = 'sitemap';
    public const MODE_INFINITE = 'infinite';
    public const MODE_HYBRID = 'hybrid';

    // Job statuses
    public const STATUS_PENDING = 'pending';
    public const STATUS_RUNNING = 'running';
    public const STATUS_PAUSED = 'paused';
    public const STATUS_COMPLETED = 'completed';
    public const STATUS_FAILED = 'failed';
    public const STATUS_CANCELLED = 'cancelled';

    // Dependencies
    private ?Database $database = null;
    private ?PDO $pdo = null;
    private ?WebScraper $scraper = null;
    private ?UrlProcessor $urlProcessor = null;
    private ?ProgressTracker $progressTracker = null;
    private ?DomainRateLimiter $rateLimiter = null;
    private ?CircuitBreaker $circuitBreaker = null;
    private ?SearchEngineAggregator $searchAggregator = null;
    private ?RelevanceScorer $relevanceScorer = null;
    private ?ContentExtractor $contentExtractor = null;
    private ?DuplicateDetector $duplicateDetector = null;
    private ?RobotsHandler $robotsHandler = null;
    private ?SitemapParser $sitemapParser = null;

    // Configuration
    private array $config = [
        'max_pages' => 100,
        'max_depth' => 3,
        'parallel_count' => 5,
        'timeout' => 10,
        'same_domain_only' => true,
        'relevance_threshold' => 2.0,
        'robots_policy' => 'respect',
        'checkpoint_interval' => 50,
        'batch_size' => 20,
        'rate_limit_delay_ms' => 1000,
    ];

    // Domain lists
    private array $forcedDomains = [];
    private array $blockedDomains = [];
    private array $includePatterns = [];
    private array $excludePatterns = [];

    // State
    private string $jobId = '';
    private string $mode = self::MODE_DEEP;
    private string $status = self::STATUS_PENDING;
    private array $searchTerms = [];
    private array $seedUrls = [];
    /** @var array<string, bool> Lower-cased hosts of all seed URLs (set semantics). */
    private array $seedHosts = [];
    /** @var array<string, bool> URLs already dequeued/processed (set semantics). */
    private array $visited = [];
    /** @var array<int, array{url: string, depth: int}> FIFO queue of pending URLs. */
    private array $queue = [];
    /** @var array<string, bool> URLs currently in $queue, for O(1) membership tests. */
    private array $queuedIndex = [];
    private int $processedCount = 0;
    private int $importedCount = 0;
    private int $errorCount = 0;
    private int $ignoredCount = 0;
    /** Depth of the most recently processed item; stored in checkpoints. */
    private int $currentDepth = 0;
    private float $startTime = 0;
    private bool $stopped = false;
    private bool $paused = false;
    /** Cleared after the first failed CALL so the fallback INSERT is used directly. */
    private bool $domainStatsProcedureAvailable = true;

    /** @var callable|null Event callback invoked as fn(string $event, array $data). */
    private $eventCallback = null;

    /**
     * Creates a crawler with a fresh job id and default scorer, extractor
     * and duplicate detector.
     */
    public function __construct()
    {
        $this->jobId = uniqid('crawl_', true);
        $this->relevanceScorer = new RelevanceScorer();
        $this->contentExtractor = new ContentExtractor();
        $this->duplicateDetector = new DuplicateDetector();
    }

    /**
     * Set database connection (also handed to the duplicate detector).
     */
    public function setDatabase(Database $database): self
    {
        $this->database = $database;
        $this->pdo = $database->getConnection();
        if ($this->duplicateDetector !== null) {
            $this->duplicateDetector->setDatabase($database);
        }
        return $this;
    }

    /**
     * Set web scraper used to fetch batches of URLs.
     */
    public function setScraper(WebScraper $scraper): self
    {
        $this->scraper = $scraper;
        return $this;
    }

    /**
     * Set URL processor
     */
    public function setUrlProcessor(UrlProcessor $processor): self
    {
        $this->urlProcessor = $processor;
        return $this;
    }

    /**
     * Set progress tracker
     */
    public function setProgressTracker(ProgressTracker $tracker): self
    {
        $this->progressTracker = $tracker;
        return $this;
    }

    /**
     * Set rate limiter
     */
    public function setRateLimiter(DomainRateLimiter $limiter): self
    {
        $this->rateLimiter = $limiter;
        return $this;
    }

    /**
     * Set circuit breaker
     */
    public function setCircuitBreaker(CircuitBreaker $breaker): self
    {
        $this->circuitBreaker = $breaker;
        return $this;
    }

    /**
     * Set search engine aggregator (required for search, infinite and hybrid modes).
     */
    public function setSearchAggregator(SearchEngineAggregator $aggregator): self
    {
        $this->searchAggregator = $aggregator;
        return $this;
    }

    /**
     * Set robots handler; consulted only when 'robots_policy' is 'respect'.
     */
    public function setRobotsHandler(RobotsHandler $handler): self
    {
        $this->robotsHandler = $handler;
        return $this;
    }

    /**
     * Set sitemap parser (required for sitemap mode).
     */
    public function setSitemapParser(SitemapParser $parser): self
    {
        $this->sitemapParser = $parser;
        return $this;
    }

    /**
     * Set event callback for SSE. Called as $callback(string $event, array $data).
     */
    public function setEventCallback(callable $callback): self
    {
        $this->eventCallback = $callback;
        return $this;
    }

    /**
     * Configure crawler options.
     *
     * Options are merged over the current configuration; counters that drive
     * loops are clamped to sane minimums (see normalizeConfig()).
     *
     * @param array<string, mixed> $options
     */
    public function configure(array $options): self
    {
        $this->config = $this->normalizeConfig(array_merge($this->config, $options));

        // Apply to relevance scorer
        if ($this->relevanceScorer !== null) {
            if (isset($options['relevance_threshold'])) {
                $this->relevanceScorer->setThreshold((float) $options['relevance_threshold']);
            }
            if (isset($options['relevance_weights'])) {
                $this->relevanceScorer->setWeights($options['relevance_weights']);
            }
        }

        return $this;
    }

    /**
     * Set forced domains (bypass relevance)
     */
    public function setForcedDomains(array $domains): self
    {
        $this->forcedDomains = $this->normalizeDomains($domains);
        $this->relevanceScorer->setForcedDomains($this->forcedDomains);
        return $this;
    }

    /**
     * Set blocked domains. A blocked domain also blocks all of its subdomains.
     */
    public function setBlockedDomains(array $domains): self
    {
        $this->blockedDomains = $this->normalizeDomains($domains);
        return $this;
    }

    /**
     * Set URL include patterns (PCRE, with delimiters). When non-empty, a URL
     * must match at least one of them to be crawled.
     */
    public function setIncludePatterns(array $patterns): self
    {
        $this->includePatterns = $patterns;
        return $this;
    }

    /**
     * Set URL exclude patterns (PCRE, with delimiters). A URL matching any of
     * them is never crawled.
     */
    public function setExcludePatterns(array $patterns): self
    {
        $this->excludePatterns = $patterns;
        return $this;
    }

    /**
     * Set search terms. Non-string and blank entries are dropped, the rest are
     * trimmed and re-indexed from 0.
     */
    public function setSearchTerms(array $terms): self
    {
        $clean = [];
        foreach ($terms as $term) {
            if (!is_string($term)) {
                continue;
            }
            $term = trim($term);
            // Compare against '' explicitly: empty() would also discard the term "0".
            if ($term !== '') {
                $clean[] = $term;
            }
        }
        $this->searchTerms = $clean;
        return $this;
    }

    /**
     * Set seed URLs. Entries failing FILTER_VALIDATE_URL are dropped and the
     * list is re-indexed so that index 0 is always the first valid seed.
     */
    public function setSeedUrls(array $urls): self
    {
        $valid = array_filter(
            $urls,
            static fn($candidate) => is_string($candidate)
                && filter_var($candidate, FILTER_VALIDATE_URL) !== false
        );
        $this->seedUrls = array_values($valid);
        $this->rebuildSeedHosts();
        return $this;
    }

    /**
     * Get job ID
     */
    public function getJobId(): string
    {
        return $this->jobId;
    }

    /**
     * Get current status (one of the STATUS_* constants).
     */
    public function getStatus(): string
    {
        return $this->status;
    }

    /**
     * Get progress data.
     *
     * @return array<string, mixed> Snapshot of counters, queue sizes, elapsed
     *                              time, processing rate and memory usage.
     */
    public function getProgress(): array
    {
        $elapsed = $this->startTime > 0 ? microtime(true) - $this->startTime : 0;
        $queueSize = count($this->queue);
        $visitedCount = count($this->visited);

        return [
            'job_id' => $this->jobId,
            'mode' => $this->mode,
            'status' => $this->status,
            'total_discovered' => $visitedCount + $queueSize,
            'processed' => $this->processedCount,
            'imported' => $this->importedCount,
            'errors' => $this->errorCount,
            'ignored' => $this->ignoredCount,
            'queue_size' => $queueSize,
            'visited_count' => $visitedCount,
            'elapsed_seconds' => round($elapsed, 2),
            'processing_rate' => $elapsed > 0 ? round($this->processedCount / $elapsed, 2) : 0,
            'memory_usage_mb' => round(memory_get_usage(true) / 1024 / 1024, 2),
        ];
    }

    /**
     * Start search-based crawl
     */
    public function startSearchCrawl(string $query, array $options = []): Generator
    {
        $this->mode = self::MODE_SEARCH;
        $this->setSearchTerms([$query]);
        $this->configure($options);

        yield from $this->run();
    }

    /**
     * Start deep crawl from URL
     */
    public function startDeepCrawl(string $url, array $options = []): Generator
    {
        $this->mode = self::MODE_DEEP;
        $this->setSeedUrls([$url]);
        $this->configure($options);

        yield from $this->run();
    }

    /**
     * Start sitemap crawl
     */
    public function startSitemapCrawl(string $sitemapUrl, array $options = []): Generator
    {
        $this->mode = self::MODE_SITEMAP;
        $this->setSeedUrls([$sitemapUrl]);
        $this->configure($options);

        yield from $this->run();
    }

    /**
     * Start infinite mode: a search crawl with the page cap lifted
     * ('max_pages' is forced to PHP_INT_MAX).
     */
    public function startInfiniteMode(string $query, array $options = []): Generator
    {
        $this->mode = self::MODE_INFINITE;
        $this->setSearchTerms([$query]);
        $options['max_pages'] = PHP_INT_MAX;
        $this->configure($options);

        yield from $this->run();
    }

    /**
     * Start hybrid crawl (search + deep)
     */
    public function startHybridCrawl(array $seeds, string $searchTerm, array $options = []): Generator
    {
        $this->mode = self::MODE_HYBRID;
        $this->setSeedUrls($seeds);
        $this->setSearchTerms([$searchTerm]);
        $this->configure($options);

        yield from $this->run();
    }

    /**
     * Pause crawling and save a checkpoint. While paused the running generator
     * keeps yielding progress snapshots so the consumer can resume or stop.
     */
    public function pause(): void
    {
        $this->paused = true;
        $this->status = self::STATUS_PAUSED;
        $this->emit('paused', ['job_id' => $this->jobId]);
        $this->saveCheckpoint();
    }

    /**
     * Resume crawling
     */
    public function resume(): void
    {
        $this->paused = false;
        $this->status = self::STATUS_RUNNING;
        $this->emit('resumed', ['job_id' => $this->jobId]);
    }

    /**
     * Stop crawling. The job ends as STATUS_CANCELLED once the current batch
     * has been processed.
     */
    public function stop(): void
    {
        $this->stopped = true;
        $this->status = self::STATUS_CANCELLED;
        $this->emit('stopped', ['job_id' => $this->jobId]);
    }

    /**
     * Main crawl execution.
     *
     * Creates the job record, fills the queue for the current mode, processes
     * it and records the final status. Any throwable marks the job as failed,
     * emits an 'error' event and is re-thrown to the consumer.
     *
     * @return Generator Yields progress snapshots; the last value is the final one.
     * @throws \Throwable Whatever the crawl raised, after the job was marked failed.
     */
    private function run(): Generator
    {
        $this->startTime = microtime(true);
        $this->status = self::STATUS_RUNNING;

        // Create job record
        $this->createJobRecord();

        $this->emit('started', [
            'job_id' => $this->jobId,
            'mode' => $this->mode,
            'config' => $this->config,
        ]);

        try {
            // Initialize queue based on mode
            yield from $this->initializeQueue();

            // Process queue
            yield from $this->processQueue();

            if ($this->stopped) {
                // stop() already emitted 'stopped'; do not report a cancelled
                // job as completed.
                $this->status = self::STATUS_CANCELLED;
                $this->updateJobRecord();
            } else {
                $this->status = self::STATUS_COMPLETED;
                $this->updateJobRecord();
                $this->emit('completed', $this->getProgress());
            }
        } catch (\Throwable $e) {
            // \Throwable rather than \Exception: a TypeError raised mid-crawl
            // must not leave the job recorded as "running".
            $this->status = self::STATUS_FAILED;
            $this->updateJobRecord($e->getMessage());

            $this->emit('error', [
                'job_id' => $this->jobId,
                'message' => $e->getMessage(),
            ]);

            throw $e;
        }

        yield $this->getProgress();
    }

    /**
     * Initialize queue based on crawl mode
     */
    private function initializeQueue(): Generator
    {
        switch ($this->mode) {
            case self::MODE_SEARCH:
            case self::MODE_INFINITE:
                yield from $this->initializeFromSearch();
                break;

            case self::MODE_SITEMAP:
                yield from $this->initializeFromSitemap();
                break;

            case self::MODE_HYBRID:
                yield from $this->initializeFromSearch();
                yield from $this->initializeFromSeeds();
                break;

            case self::MODE_DEEP:
            default:
                yield from $this->initializeFromSeeds();
                break;
        }
    }

    /**
     * Initialize queue from search results.
     *
     * Does nothing without a search aggregator or search terms. A failing
     * search for one term is reported as 'search_error' and does not abort
     * the remaining terms.
     */
    private function initializeFromSearch(): Generator
    {
        if (!$this->searchAggregator || empty($this->searchTerms)) {
            return;
        }

        foreach ($this->searchTerms as $term) {
            $this->emit('search_started', ['term' => $term]);

            try {
                $results = $this->searchAggregator->search($term, [
                    'max_results' => min($this->config['max_pages'], 100),
                ]);

                foreach ($results as $result) {
                    $url = $result['url'] ?? $result['link'] ?? null;
                    if ($url && $this->shouldCrawl($url, 0)) {
                        $this->addToQueue($url, 0);
                    }
                }

                $this->emit('search_completed', [
                    'term' => $term,
                    'results_count' => count($results),
                ]);
            } catch (\Exception $e) {
                $this->emit('search_error', [
                    'term' => $term,
                    'error' => $e->getMessage(),
                ]);
            }

            yield $this->getProgress();
        }
    }

    /**
     * Initialize queue from seed URLs
     */
    private function initializeFromSeeds(): Generator
    {
        foreach ($this->seedUrls as $url) {
            if ($this->shouldCrawl($url, 0)) {
                $this->addToQueue($url, 0);
            }
        }
        yield $this->getProgress();
    }

    /**
     * Initialize queue from sitemap.
     *
     * Every seed URL is treated as a sitemap location. Does nothing without a
     * sitemap parser or seeds; a failing sitemap is reported as 'sitemap_error'.
     */
    private function initializeFromSitemap(): Generator
    {
        if (!$this->sitemapParser || empty($this->seedUrls)) {
            return;
        }

        foreach ($this->seedUrls as $sitemapUrl) {
            $this->emit('sitemap_started', ['url' => $sitemapUrl]);

            try {
                $urls = $this->sitemapParser->parse($sitemapUrl);

                foreach ($urls as $url) {
                    if ($this->shouldCrawl($url, 0)) {
                        $this->addToQueue($url, 0);
                    }
                }

                $this->emit('sitemap_completed', [
                    'url' => $sitemapUrl,
                    'urls_count' => count($urls),
                ]);
            } catch (\Exception $e) {
                $this->emit('sitemap_error', [
                    'url' => $sitemapUrl,
                    'error' => $e->getMessage(),
                ]);
            }

            yield $this->getProgress();
        }
    }

    /**
     * Process the URL queue in batches until it is empty, the job is stopped
     * or 'max_pages' is reached.
     *
     * @return Generator Yields a progress snapshot after every processed item,
     *                   after every batch and repeatedly while paused.
     */
    private function processQueue(): Generator
    {
        // Clamp again here: the configuration may come from a checkpoint.
        $batchSize = max(1, (int) $this->config['batch_size']);
        $checkpointInterval = max(1, (int) $this->config['checkpoint_interval']);
        $maxPages = (int) $this->config['max_pages'];
        $lastCheckpoint = $this->processedCount;
        $lastGc = $this->processedCount;

        while (!empty($this->queue) && !$this->stopped) {
            // Handle pause. Yield before sleeping: control must return to the
            // consumer, otherwise nobody could ever call resume() or stop().
            while ($this->paused && !$this->stopped) {
                yield $this->getProgress();
                usleep(100000); // 100ms
            }

            if ($this->stopped) {
                break;
            }

            // Check max pages
            $remaining = $maxPages - $this->processedCount;
            if ($remaining <= 0) {
                $this->emit('max_pages_reached', ['count' => $this->processedCount]);
                break;
            }

            // Get batch from queue, never taking more than the page budget allows
            $batch = array_splice($this->queue, 0, min($batchSize, $remaining));

            // Mark dequeued URLs as visited right away so that links found
            // while processing this batch cannot re-queue a URL from the batch.
            foreach ($batch as $entry) {
                unset($this->queuedIndex[$entry['url']]);
                $this->visited[$entry['url']] = true;
            }

            // Fetch batch
            $results = $this->fetchBatch($batch);

            // Process each result
            foreach ($results as $item) {
                yield from $this->processItem($item);
            }

            // Checkpoint
            if ($this->processedCount - $lastCheckpoint >= $checkpointInterval) {
                $this->saveCheckpoint();
                $lastCheckpoint = $this->processedCount;
            }

            // Memory management: roughly every 100 processed pages
            if ($this->processedCount - $lastGc >= 100) {
                gc_collect_cycles();
                $lastGc = $this->processedCount;
            }

            yield $this->getProgress();
        }
    }

    /**
     * Fetch a batch of URLs.
     *
     * Returns an empty list when no scraper is configured, which means the
     * batch is dropped without being counted as processed.
     *
     * @param array<int, array{url: string, depth: int}> $batch
     * @return array<int, array<string, mixed>> One result per batch entry with
     *         keys url, depth, html, error, http_code and response_time.
     */
    private function fetchBatch(array $batch): array
    {
        if (!$this->scraper) {
            return [];
        }

        // Responses are looked up by URL below.
        $responses = $this->scraper->fetchBatch(array_column($batch, 'url'), [
            'timeout' => $this->config['timeout'],
            'parallel' => $this->config['parallel_count'],
        ]);

        $results = [];
        foreach ($batch as $entry) {
            $response = $responses[$entry['url']] ?? [];
            $results[] = [
                'url' => $entry['url'],
                'depth' => $entry['depth'],
                'html' => $response['html'] ?? '',
                'error' => $response['error'] ?? null,
                'http_code' => $response['http_code'] ?? 0,
                'response_time' => $response['response_time'] ?? 0,
            ];
        }

        return $results;
    }

    /**
     * Process a single crawl item: record stats, filter by relevance and
     * duplication, import, then queue outgoing links.
     *
     * @param array<string, mixed> $item Result entry produced by fetchBatch().
     * @return Generator Yields one progress snapshot when the item made it
     *                   past the error, relevance and duplicate checks.
     */
    private function processItem(array $item): Generator
    {
        $url = $item['url'];
        $depth = $item['depth'];
        $html = $item['html'];
        $error = $item['error'];

        $this->processedCount++;
        $this->visited[$url] = true;
        $this->currentDepth = (int) $depth;

        // Record domain statistics
        $this->recordDomainStats($url, $item);

        // Handle errors
        if ($error || empty($html)) {
            $this->errorCount++;
            $this->emit('url_error', [
                'url' => $url,
                'error' => $error ?? 'Empty response',
            ]);
            return;
        }

        // Parse HTML
        $dom = $this->parseHtml((string) $html);

        // Check relevance
        if (!empty($this->searchTerms)) {
            if (!$this->relevanceScorer->isRelevant($dom, $url, $this->searchTerms)) {
                $this->ignoredCount++;
                $this->emit('url_irrelevant', ['url' => $url]);
                return;
            }
        }

        // Check for duplicate content before paying for content extraction
        if ($this->duplicateDetector->isDuplicateContent($html)) {
            $this->ignoredCount++;
            $this->emit('url_duplicate', ['url' => $url]);
            return;
        }

        // Extract content
        $metadata = $this->contentExtractor->extract($html, $url);
        $title = $metadata['title'] ?? 'No title';

        // Import to pinfeeds
        if ($this->importToPinfeeds($url, $metadata)) {
            $this->importedCount++;
            $this->emit('url_imported', [
                'url' => $url,
                'title' => $title,
            ]);

            // Record content hash
            $this->duplicateDetector->recordContent($url, $html, $title, $this->jobId);
        }

        // Extract links for deep crawling
        if ($depth < $this->config['max_depth']) {
            foreach ($this->extractLinks($dom, $url) as $link) {
                if ($this->shouldCrawl($link, $depth + 1)) {
                    $this->addToQueue($link, $depth + 1);
                }
            }
        }

        yield $this->getProgress();
    }

    /**
     * Parse an HTML string into a DOMDocument without leaking libxml state.
     */
    private function parseHtml(string $html): \DOMDocument
    {
        $dom = new \DOMDocument();
        $previous = libxml_use_internal_errors(true);
        try {
            // The XML declaration prefix makes libxml treat the markup as UTF-8.
            $dom->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_NOERROR | LIBXML_NOWARNING);
        } finally {
            libxml_clear_errors();
            // Restore the caller's setting instead of leaving it enabled globally.
            libxml_use_internal_errors($previous);
        }

        return $dom;
    }

    /**
     * Check if URL should be crawled.
     *
     * A URL is rejected when it was already visited or queued, is not a valid
     * http(s) URL, exceeds 'max_depth', belongs to a blocked domain, is outside
     * the seed hosts while 'same_domain_only' is on, fails the include/exclude
     * patterns, is disallowed by robots.txt or already exists in the database.
     */
    private function shouldCrawl(string $url, int $depth): bool
    {
        // Already visited or already in queue (both O(1) lookups)
        if (isset($this->visited[$url]) || isset($this->queuedIndex[$url])) {
            return false;
        }

        // Invalid URL
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            return false;
        }

        // FILTER_VALIDATE_URL also accepts mailto:, file:, ftp: and the like;
        // only http(s) URLs with a host are crawlable.
        $parts = parse_url($url);
        $scheme = is_array($parts) ? strtolower($parts['scheme'] ?? '') : '';
        $host = is_array($parts) ? strtolower($parts['host'] ?? '') : '';
        if (($scheme !== 'http' && $scheme !== 'https') || $host === '') {
            return false;
        }

        // Check depth
        if ($depth > $this->config['max_depth']) {
            return false;
        }

        // Check blocked domains (exact host or any subdomain)
        foreach ($this->blockedDomains as $blocked) {
            if ($this->hostMatchesDomain($host, (string) $blocked)) {
                return false;
            }
        }

        // Check same domain restriction against every seed host
        if (
            $this->config['same_domain_only']
            && !empty($this->seedHosts)
            && !isset($this->seedHosts[$host])
        ) {
            return false;
        }

        // Check include patterns
        if (!empty($this->includePatterns)) {
            $included = false;
            foreach ($this->includePatterns as $pattern) {
                if (preg_match($pattern, $url) === 1) {
                    $included = true;
                    break;
                }
            }
            if (!$included) {
                return false;
            }
        }

        // Check exclude patterns
        foreach ($this->excludePatterns as $pattern) {
            if (preg_match($pattern, $url) === 1) {
                return false;
            }
        }

        // Check robots.txt
        if ($this->config['robots_policy'] === 'respect' && $this->robotsHandler) {
            if (!$this->robotsHandler->isAllowed($url)) {
                return false;
            }
        }

        // Check if already in database
        if ($this->duplicateDetector !== null && $this->duplicateDetector->existsInDatabase($url)) {
            return false;
        }

        return true;
    }

    /**
     * Add URL to queue (no-op when the URL is already queued).
     */
    private function addToQueue(string $url, int $depth): void
    {
        if (isset($this->queuedIndex[$url])) {
            return;
        }

        $this->queue[] = [
            'url' => $url,
            'depth' => $depth,
        ];
        $this->queuedIndex[$url] = true;
    }

    /**
     * Extract links from DOM.
     *
     * Every <a href> is resolved against $baseUrl (including its port and
     * directory), fragments are removed and only valid http(s) URLs are kept.
     *
     * @return string[] Unique absolute URLs in document order.
     */
    private function extractLinks(\DOMDocument $dom, string $baseUrl): array
    {
        $baseParts = parse_url($baseUrl);
        if (!is_array($baseParts)) {
            $baseParts = [];
        }

        $links = [];
        foreach ($dom->getElementsByTagName('a') as $anchor) {
            $resolved = $this->resolveUrl($anchor->getAttribute('href'), $baseParts);
            if ($resolved !== null && filter_var($resolved, FILTER_VALIDATE_URL)) {
                // Keyed by URL to de-duplicate while preserving first-seen order.
                $links[$resolved] = true;
            }
        }

        return array_keys($links);
    }

    /**
     * Resolve a (possibly relative) href against parsed base URL components.
     *
     * @param array<string, mixed> $base Result of parse_url() on the page URL.
     * @return string|null Absolute http(s) URL without fragment, or null when
     *                     the href is empty, fragment-only, uses another scheme
     *                     or cannot be resolved.
     */
    private function resolveUrl(string $href, array $base): ?string
    {
        $href = trim($href);
        if ($href === '' || $href[0] === '#') {
            return null;
        }

        // A fragment never changes the fetched document; drop it.
        $hashPos = strpos($href, '#');
        if ($hashPos !== false) {
            $href = substr($href, 0, $hashPos);
        }

        // Absolute reference: accept http(s) only (skips mailto:, javascript:,
        // tel:, data: ... regardless of letter case).
        if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $href, $match) === 1) {
            $scheme = strtolower($match[1]);
            return ($scheme === 'http' || $scheme === 'https') ? $href : null;
        }

        $baseScheme = strtolower((string) ($base['scheme'] ?? 'https'));

        // Protocol-relative reference
        if (strpos($href, '//') === 0) {
            return $baseScheme . ':' . $href;
        }

        $baseHost = (string) ($base['host'] ?? '');
        if ($baseHost === '') {
            return null;
        }

        $root = $baseScheme . '://' . $baseHost . (isset($base['port']) ? ':' . $base['port'] : '');
        $basePath = (string) ($base['path'] ?? '/');
        if ($basePath === '') {
            $basePath = '/';
        }

        // Query-only reference keeps the base path
        if ($href[0] === '?') {
            return $root . $basePath . $href;
        }

        if ($href[0] === '/') {
            $target = $href;
        } else {
            // Relative to the directory of the base document
            $slash = strrpos($basePath, '/');
            $directory = $slash === false ? '/' : substr($basePath, 0, $slash + 1);
            $target = $directory . $href;
        }

        $query = '';
        $queryPos = strpos($target, '?');
        if ($queryPos !== false) {
            $query = substr($target, $queryPos);
            $target = substr($target, 0, $queryPos);
        }

        return $root . $this->removeDotSegments($target) . $query;
    }

    /**
     * Collapse "." and ".." segments of an absolute URL path.
     */
    private function removeDotSegments(string $path): string
    {
        $segments = explode('/', $path);
        $lastIndex = count($segments) - 1;
        $output = [];

        foreach ($segments as $index => $segment) {
            if ($segment === '.' || $segment === '..') {
                // Never pop the leading empty segment that represents the root.
                if ($segment === '..' && count($output) > 1) {
                    array_pop($output);
                }
                // A trailing dot segment refers to a directory: keep the slash.
                if ($index === $lastIndex) {
                    $output[] = '';
                }
                continue;
            }
            $output[] = $segment;
        }

        return implode('/', $output);
    }

    /**
     * Tell whether $host equals $domain or is one of its subdomains.
     */
    private function hostMatchesDomain(string $host, string $domain): bool
    {
        $domain = ltrim(strtolower(trim($domain)), '.');
        if ($domain === '') {
            return false;
        }
        if ($host === $domain) {
            return true;
        }

        $suffix = '.' . $domain;
        return substr($host, -strlen($suffix)) === $suffix;
    }

    /**
     * Lower-case, trim and de-duplicate a list of domain names, dropping
     * non-string and empty entries.
     *
     * @return string[]
     */
    private function normalizeDomains(array $domains): array
    {
        $clean = [];
        foreach ($domains as $domain) {
            if (!is_string($domain)) {
                continue;
            }
            $domain = ltrim(strtolower(trim($domain)), '.');
            if ($domain !== '') {
                $clean[] = $domain;
            }
        }

        return array_values(array_unique($clean));
    }

    /**
     * Clamp the numeric options that drive loops so they cannot stall the
     * crawl (a batch size of 0 would never drain the queue).
     *
     * @param array<string, mixed> $config
     * @return array<string, mixed>
     */
    private function normalizeConfig(array $config): array
    {
        $minimums = [
            'max_pages' => 0,
            'max_depth' => 0,
            'parallel_count' => 1,
            'batch_size' => 1,
            'checkpoint_interval' => 1,
        ];

        foreach ($minimums as $key => $minimum) {
            if (array_key_exists($key, $config)) {
                $config[$key] = max($minimum, (int) $config[$key]);
            }
        }

        return $config;
    }

    /**
     * Rebuild the set of lower-cased seed hosts from $seedUrls.
     */
    private function rebuildSeedHosts(): void
    {
        $this->seedHosts = [];
        foreach ($this->seedUrls as $seed) {
            $host = is_string($seed) ? parse_url($seed, PHP_URL_HOST) : null;
            if (is_string($host) && $host !== '') {
                $this->seedHosts[strtolower($host)] = true;
            }
        }
    }

    /**
     * Import data to pinfeeds table.
     *
     * @param array<string, mixed> $metadata Extracted page metadata.
     * @return bool True when the row was inserted; false without a database,
     *              on duplicates and on any other failure (which is logged).
     */
    private function importToPinfeeds(string $url, array $metadata): bool
    {
        if (!$this->pdo) {
            return false;
        }

        try {
            $stmt = $this->pdo->prepare("
                INSERT INTO pinfeeds (
                    link, title, description, thumbnail, favicon,
                    embed_code, author, tags, source_website, pubDate, created_at
                ) VALUES (
                    ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()
                )
            ");
            if ($stmt === false) {
                return false;
            }

            $tags = $metadata['tags'] ?? '';
            if (is_array($tags)) {
                $tags = implode(',', $tags);
            } elseif (!is_string($tags)) {
                $tags = '';
            }

            // With PDO in silent error mode execute() reports failure through
            // its return value, so propagate it instead of assuming success.
            return $stmt->execute([
                $url,
                $metadata['title'] ?? 'No title',
                $metadata['description'] ?? 'No description',
                $metadata['thumbnail'] ?? 'fallback_image.jpg',
                $metadata['favicon'] ?? 'default_favicon.ico',
                $metadata['embed_code'] ?? null,
                // NOTE(review): a made-up author is stored when none was extracted.
                $metadata['author'] ?? $this->generateRandomAuthor(),
                $tags,
                parse_url($url, PHP_URL_HOST),
                $metadata['published_date'] ?? date('Y-m-d H:i:s'),
            ]);
        } catch (\Exception $e) {
            // Duplicate key is expected (SQLSTATE 23000 = integrity constraint violation)
            $isDuplicate = strpos($e->getMessage(), 'Duplicate') !== false
                || ($e instanceof \PDOException && (string) $e->getCode() === '23000');
            if (!$isDuplicate) {
                error_log("UnifiedCrawler::importToPinfeeds error: " . $e->getMessage());
            }
            return false;
        }
    }

    /**
     * Generate random author name
     */
    private function generateRandomAuthor(): string
    {
        $names = [
            'Carlos Dias', 'Mariana Silva', 'John Santos', 'Ana Monteiro',
            'Pedro Correia', 'Maria Almeida', 'Sofia Nunes', 'Rodrigo Azevedo',
            'Luciana Araujo', 'Felipe Santana', 'James Wilson', 'Emily Brown',
        ];
        return $names[array_rand($names)];
    }

    /**
     * Record domain statistics (best effort; failures are ignored).
     *
     * Tries the update_domain_stats stored procedure first and falls back to a
     * direct upsert into crawler_domain_stats. Once the procedure call has
     * failed, later calls go straight to the fallback.
     *
     * @param array<string, mixed> $item Result entry produced by fetchBatch().
     */
    private function recordDomainStats(string $url, array $item): void
    {
        if (!$this->pdo) {
            return;
        }

        $host = parse_url($url, PHP_URL_HOST);
        if (!$host) {
            return;
        }

        // Bind integers, not booleans: PDO sends a positional false as ''.
        $successFlag = (empty($item['error']) && !empty($item['html'])) ? 1 : 0;
        $failureFlag = 1 - $successFlag;
        $httpCode = $item['http_code'] ?? 0;

        if ($this->domainStatsProcedureAvailable) {
            try {
                $stmt = $this->pdo->prepare("CALL update_domain_stats(?, ?, ?, ?, ?, ?)");
                $done = $stmt !== false && $stmt->execute([
                    $host,
                    $successFlag,
                    $item['response_time'] ?? 0,
                    $httpCode,
                    strlen((string) ($item['html'] ?? '')),
                    $item['error'] ?? null,
                ]);
                if ($done) {
                    return;
                }
            } catch (\Exception $e) {
                // Stored procedure may not exist, use direct insert
            }
            $this->domainStatsProcedureAvailable = false;
        }

        try {
            $stmt = $this->pdo->prepare("
                INSERT INTO crawler_domain_stats (domain, total_visits, successful_visits, failed_visits, last_visit, last_http_code)
                VALUES (?, 1, ?, ?, NOW(), ?)
                ON DUPLICATE KEY UPDATE
                    total_visits = total_visits + 1,
                    successful_visits = successful_visits + ?,
                    failed_visits = failed_visits + ?,
                    last_visit = NOW(),
                    last_http_code = ?
            ");
            if ($stmt === false) {
                return;
            }
            $stmt->execute([
                $host,
                $successFlag,
                $failureFlag,
                $httpCode,
                $successFlag,
                $failureFlag,
                $httpCode,
            ]);
        } catch (\Exception $e) {
            // Ignore stats errors
        }
    }

    /**
     * Create job record in database (no-op without a database; errors are logged).
     */
    private function createJobRecord(): void
    {
        if (!$this->pdo) {
            return;
        }

        try {
            $stmt = $this->pdo->prepare("
                INSERT INTO crawler_jobs (
                    id, type, mode, seed_url, search_term, status,
                    max_pages, max_depth, parallel_count, same_domain_only,
                    relevance_threshold, robots_policy, created_at
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW())
            ");
            if ($stmt === false) {
                return;
            }

            $stmt->execute([
                $this->jobId,
                'unified',
                $this->mode,
                $this->seedUrls[0] ?? null,
                implode(', ', $this->searchTerms),
                $this->status,
                $this->config['max_pages'],
                $this->config['max_depth'],
                $this->config['parallel_count'],
                $this->config['same_domain_only'] ? 1 : 0,
                $this->config['relevance_threshold'],
                $this->config['robots_policy'],
            ]);
        } catch (\Exception $e) {
            error_log("UnifiedCrawler::createJobRecord error: " . $e->getMessage());
        }
    }

    /**
     * Update job record with the current status and counters.
     *
     * @param string|null $error Last error message to store, if any.
     */
    private function updateJobRecord(?string $error = null): void
    {
        if (!$this->pdo) {
            return;
        }

        try {
            $stmt = $this->pdo->prepare("
                UPDATE crawler_jobs SET
                    status = ?,
                    pages_crawled = ?,
                    pages_found = ?,
                    errors = ?,
                    last_error = ?,
                    completed_at = IF(? IN ('completed', 'failed', 'cancelled'), NOW(), NULL),
                    updated_at = NOW()
                WHERE id = ?
            ");
            if ($stmt === false) {
                return;
            }

            $stmt->execute([
                $this->status,
                $this->processedCount,
                $this->importedCount,
                $this->errorCount,
                $error,
                $this->status,
                $this->jobId,
            ]);
        } catch (\Exception $e) {
            error_log("UnifiedCrawler::updateJobRecord error: " . $e->getMessage());
        }
    }

    /**
     * Save checkpoint for resume (no-op without a database; errors are logged).
     *
     * Besides queue and visited set, the checkpoint carries mode, counters and
     * URL patterns so that loadFromCheckpoint() can restore them.
     */
    private function saveCheckpoint(): void
    {
        if (!$this->pdo) {
            return;
        }

        try {
            $checkpointData = json_encode([
                'visited' => array_keys($this->visited),
                'queue' => $this->queue,
                'config' => $this->config,
                'search_terms' => $this->searchTerms,
                'seed_urls' => $this->seedUrls,
                'forced_domains' => $this->forcedDomains,
                'blocked_domains' => $this->blockedDomains,
                'include_patterns' => $this->includePatterns,
                'exclude_patterns' => $this->excludePatterns,
                'mode' => $this->mode,
                'counters' => [
                    'processed' => $this->processedCount,
                    'imported' => $this->importedCount,
                    'errors' => $this->errorCount,
                    'ignored' => $this->ignoredCount,
                ],
            ], JSON_INVALID_UTF8_SUBSTITUTE);

            if ($checkpointData === false) {
                error_log("UnifiedCrawler::saveCheckpoint error: " . json_last_error_msg());
                return;
            }

            $stmt = $this->pdo->prepare("
                INSERT INTO crawler_checkpoints (
                    job_id, session_id, checkpoint_data,
                    urls_processed, current_depth, queue_size
                ) VALUES (?, ?, ?, ?, ?, ?)
            ");
            if ($stmt === false) {
                return;
            }

            $stmt->execute([
                $this->jobId,
                $this->jobId,
                $checkpointData,
                $this->processedCount,
                // Depth actually reached so far, not the configured maximum.
                $this->currentDepth,
                count($this->queue),
            ]);

            $this->emit('checkpoint_saved', ['processed' => $this->processedCount]);
        } catch (\Exception $e) {
            error_log("UnifiedCrawler::saveCheckpoint error: " . $e->getMessage());
        }
    }

    /**
     * Emit event via callback and log it to the activity table.
     */
    private function emit(string $event, array $data = []): void
    {
        if ($this->eventCallback) {
            ($this->eventCallback)($event, $data);
        }

        // Also log to activity table
        $this->logActivity($event, $data);
    }

    /**
     * Log activity to database (best effort; failures are ignored).
     *
     * The level is derived from the event name: "error" events are errors,
     * "completed"/"imported" are successes, "irrelevant"/"duplicate" warnings.
     */
    private function logActivity(string $eventType, array $data): void
    {
        if (!$this->pdo) {
            return;
        }

        $level = 'info';
        if (strpos($eventType, 'error') !== false) {
            $level = 'error';
        } elseif (strpos($eventType, 'completed') !== false || strpos($eventType, 'imported') !== false) {
            $level = 'success';
        } elseif (strpos($eventType, 'irrelevant') !== false || strpos($eventType, 'duplicate') !== false) {
            $level = 'warning';
        }

        try {
            $stmt = $this->pdo->prepare("
                INSERT INTO crawler_activity_log (job_id, event_type, event_level, message, context, url)
                VALUES (?, ?, ?, ?, ?, ?)
            ");
            if ($stmt === false) {
                return;
            }

            // Scraped titles may carry invalid UTF-8; never bind false as context.
            $context = json_encode($data, JSON_INVALID_UTF8_SUBSTITUTE | JSON_PARTIAL_OUTPUT_ON_ERROR);

            $stmt->execute([
                $this->jobId,
                $eventType,
                $level,
                $this->formatEventMessage($eventType, $data),
                $context === false ? '{}' : $context,
                $data['url'] ?? null,
            ]);
        } catch (\Exception $e) {
            // Ignore logging errors
        }
    }

    /**
     * Format event message for logging. Missing data keys render as empty
     * text instead of raising notices.
     */
    private function formatEventMessage(string $event, array $data): string
    {
        switch ($event) {
            case 'started':
                return 'Crawl started in ' . ($data['mode'] ?? '') . ' mode';
            case 'completed':
                return 'Crawl completed: ' . ($data['imported'] ?? 0) . ' imported, '
                    . ($data['errors'] ?? 0) . ' errors';
            case 'url_imported':
                return 'Imported: ' . ($data['title'] ?? '');
            case 'url_error':
                return 'Error: ' . ($data['error'] ?? '');
            case 'url_irrelevant':
                return 'Skipped (irrelevant): ' . ($data['url'] ?? '');
            case 'url_duplicate':
                return 'Skipped (duplicate): ' . ($data['url'] ?? '');
            case 'search_started':
                return 'Searching: ' . ($data['term'] ?? '');
            case 'search_completed':
                return 'Search completed: ' . ($data['results_count'] ?? 0) . ' results';
            default:
                return ucfirst(str_replace('_', ' ', $event));
        }
    }

    /**
     * Load from checkpoint.
     *
     * Restores queue, visited set, configuration, terms, seeds, domain lists
     * and - when present in the checkpoint - mode, counters and URL patterns.
     *
     * @return bool True when the latest checkpoint of $jobId was loaded.
     */
    public function loadFromCheckpoint(string $jobId): bool
    {
        if (!$this->pdo) {
            return false;
        }

        try {
            $stmt = $this->pdo->prepare("
                SELECT checkpoint_data FROM crawler_checkpoints
                WHERE job_id = ? ORDER BY created_at DESC LIMIT 1
            ");
            if ($stmt === false) {
                return false;
            }
            $stmt->execute([$jobId]);
            $row = $stmt->fetch(PDO::FETCH_ASSOC);

            if (!$row) {
                return false;
            }

            $data = json_decode((string) $row['checkpoint_data'], true);
            if (!is_array($data) || empty($data)) {
                return false;
            }

            $this->jobId = $jobId;
            $this->visited = array_fill_keys($data['visited'] ?? [], true);

            // Rebuild queue and its lookup index, skipping malformed entries.
            $this->queue = [];
            $this->queuedIndex = [];
            foreach ($data['queue'] ?? [] as $entry) {
                if (is_array($entry) && isset($entry['url']) && is_string($entry['url'])) {
                    $this->addToQueue($entry['url'], (int) ($entry['depth'] ?? 0));
                }
            }

            // Merge over the defaults so options added later keep a value.
            $savedConfig = is_array($data['config'] ?? null) ? $data['config'] : [];
            $this->config = $this->normalizeConfig(array_merge($this->config, $savedConfig));

            $this->searchTerms = $data['search_terms'] ?? [];
            $this->seedUrls = array_values($data['seed_urls'] ?? []);
            $this->rebuildSeedHosts();
            $this->forcedDomains = $this->normalizeDomains($data['forced_domains'] ?? []);
            $this->blockedDomains = $this->normalizeDomains($data['blocked_domains'] ?? []);
            $this->includePatterns = $data['include_patterns'] ?? $this->includePatterns;
            $this->excludePatterns = $data['exclude_patterns'] ?? $this->excludePatterns;
            $this->mode = $data['mode'] ?? $this->mode;

            $counters = is_array($data['counters'] ?? null) ? $data['counters'] : [];
            $this->processedCount = (int) ($counters['processed'] ?? $this->processedCount);
            $this->importedCount = (int) ($counters['imported'] ?? $this->importedCount);
            $this->errorCount = (int) ($counters['errors'] ?? $this->errorCount);
            $this->ignoredCount = (int) ($counters['ignored'] ?? $this->ignoredCount);

            // Keep the scorer in sync with the restored settings.
            if ($this->relevanceScorer !== null) {
                $this->relevanceScorer->setForcedDomains($this->forcedDomains);
                if (isset($this->config['relevance_threshold'])) {
                    $this->relevanceScorer->setThreshold((float) $this->config['relevance_threshold']);
                }
            }

            return true;
        } catch (\Exception $e) {
            error_log("UnifiedCrawler::loadFromCheckpoint error: " . $e->getMessage());
            return false;
        }
    }
}
