<?php
/**
 * CRAWLER PIPELINE - Automatic producer-consumer system
 * Discovery Workers → crawler_ready table → Processor Workers → pinfeeds
 * Single self-contained file with a real-time dashboard
 */

// ============================================
// BLOCK 1: HEADER + DB CONNECTION
// ============================================
// Report everything except notices and warnings.
error_reporting(E_ALL & ~E_NOTICE & ~E_WARNING);
// Workers run for a long time: lift the execution time limit and raise the memory ceiling.
set_time_limit(0);
ini_set('memory_limit', '512M');

// SECURITY NOTE(review): these credentials were hard-coded in source. They can now be
// supplied through the environment (CRAWLER_DB_HOST / CRAWLER_DB_NAME / CRAWLER_DB_USER /
// CRAWLER_DB_PASS). The literals are kept only as a backward-compatible fallback so existing
// deployments keep working; rotate the password and drop the fallbacks once the environment
// variables are in place.
$DB_HOST = getenv('CRAWLER_DB_HOST') ?: 'localhost';
$DB_NAME = getenv('CRAWLER_DB_NAME') ?: 'digupdog_FEED';
$DB_USER = getenv('CRAWLER_DB_USER') ?: 'digupdog_FEEDadmin';
$DB_PASS = getenv('CRAWLER_DB_PASS') ?: 'Raimundinho1';

try {
    $pdo = new PDO("mysql:host=$DB_HOST;dbname=$DB_NAME;charset=utf8mb4", $DB_USER, $DB_PASS, [
        PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,      // every DB failure surfaces as a PDOException
        PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC, // rows come back as associative arrays
        PDO::ATTR_EMULATE_PREPARES => false,              // use native server-side prepared statements
        PDO::MYSQL_ATTR_INIT_COMMAND => "SET NAMES utf8mb4"
    ]);
} catch (PDOException $e) {
    // Keep the reason in the server log; the client only ever sees a generic message.
    error_log('Crawler pipeline: DB connection failed: ' . $e->getMessage());
    // Requests carrying ?action=... get a JSON error body; everything else gets plain text.
    if (isset($_GET['action'])) { header('Content-Type: application/json'); die(json_encode(['error' => 'DB connection failed'])); }
    die("DB Error");
}

// Create the pipeline tables on demand. Every statement is CREATE TABLE IF NOT EXISTS,
// so re-running this script against an existing database is harmless.

// crawler_ready: queue of URLs (optionally with already-fetched HTML) that is filled by
// mpAddToReady()/mpAddToReadyWithHtml() and consumed through mpClaimReady().
// The unique key on url_hash is what makes the INSERT IGNORE calls de-duplicate.
$pdo->exec("CREATE TABLE IF NOT EXISTS crawler_ready (
    id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    url TEXT NOT NULL,
    url_hash VARCHAR(64) NOT NULL,
    html LONGTEXT DEFAULT NULL,
    http_code SMALLINT DEFAULT 0,
    source_keyword VARCHAR(500) DEFAULT NULL,
    source_engine VARCHAR(50) DEFAULT NULL,
    priority TINYINT DEFAULT 50,
    status ENUM('pending','claimed','done','failed') DEFAULT 'pending',
    process_id VARCHAR(50) DEFAULT NULL,
    claimed_at DATETIME DEFAULT NULL,
    attempts TINYINT DEFAULT 0,
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    UNIQUE KEY idx_url_hash (url_hash),
    KEY idx_status_priority (status, priority DESC, id ASC),
    KEY idx_process (process_id, status),
    KEY idx_created (created_at)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4");

// pipeline_processes: one row per worker. mpRegisterProcess() creates it, mpHeartbeat()
// refreshes last_heartbeat and the JSON stats, and mpControlProcess() changes status.
$pdo->exec("CREATE TABLE IF NOT EXISTS pipeline_processes (
    id VARCHAR(50) PRIMARY KEY,
    mode VARCHAR(30) DEFAULT 'pipeline',
    worker_mode VARCHAR(20) DEFAULT 'discovery',
    status ENUM('running','paused','stopped','dead') DEFAULT 'running',
    stats JSON DEFAULT NULL,
    last_heartbeat DATETIME DEFAULT CURRENT_TIMESTAMP,
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    KEY idx_status (status),
    KEY idx_mode (worker_mode, status)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4");

// crawler_pool: crawl frontier. mpAddToPool() inserts URLs, mpClaimUrls() hands them to a
// discovery worker, and mpMarkDone() records the outcome. De-duplicated via url_hash.
$pdo->exec("CREATE TABLE IF NOT EXISTS crawler_pool (
    id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    url TEXT NOT NULL,
    url_hash VARCHAR(64) NOT NULL,
    priority TINYINT UNSIGNED DEFAULT 50,
    status ENUM('pending','claimed','done','error') DEFAULT 'pending',
    process_id VARCHAR(50) DEFAULT NULL,
    claimed_at DATETIME DEFAULT NULL,
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    UNIQUE KEY idx_url_hash (url_hash),
    KEY idx_status_priority (status, priority DESC, id ASC),
    KEY idx_process (process_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4");

// crawler_seen: URLs that were already handled, keyed by their hash. Written by
// mpMarkSeen() and looked up in bulk by mpIsSeenBatch().
$pdo->exec("CREATE TABLE IF NOT EXISTS crawler_seen (
    url_hash VARCHAR(64) PRIMARY KEY,
    url TEXT NOT NULL,
    status VARCHAR(20) DEFAULT 'seen',
    process_id VARCHAR(50) DEFAULT NULL,
    seen_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    KEY idx_status (status)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4");

// crawler_logs: batches of log entries, stored as JSON per process by mpWriteLog().
$pdo->exec("CREATE TABLE IF NOT EXISTS crawler_logs (
    id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    process_id VARCHAR(50) NOT NULL,
    entries JSON DEFAULT NULL,
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    KEY idx_process (process_id),
    KEY idx_created (created_at)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4");

// Migration for tables created by an earlier version of this script: add the worker_mode
// column when it is missing. Best effort - any failure here is deliberately ignored.
try {
    $stmt = $pdo->query("SHOW COLUMNS FROM pipeline_processes LIKE 'worker_mode'");
    if ($stmt->rowCount() === 0) {
        $pdo->exec("ALTER TABLE pipeline_processes ADD COLUMN worker_mode VARCHAR(20) DEFAULT 'discovery' AFTER mode");
    }
} catch(Exception $e) {}

// Constants
// MP_MAX_PROCESSES: mpCanStart() refuses new workers once this many are running or paused.
define('MP_MAX_PROCESSES', 1000);
// MP_HEARTBEAT_SEC: minimum number of seconds between two heartbeats in the worker loops.
define('MP_HEARTBEAT_SEC', 30);
// MP_DEAD_TIMEOUT: seconds without a heartbeat before mpCleanupDead() marks a running process dead.
define('MP_DEAD_TIMEOUT', 120);

// ============================================
// BLOCK 2: MULTI-PROCESS FUNCTIONS
// ============================================

/**
 * Build a fresh process identifier shaped like "<prefix>_<8 hex chars>_<unix timestamp>".
 *
 * @param string $prefix Leading tag of the identifier.
 * @return string The generated identifier.
 */
function mpCreateProcess($prefix = 'PL') {
    $digest = md5(uniqid(mt_rand(), true));
    $shortToken = substr($digest, 0, 8);

    return implode('_', [$prefix, $shortToken, time()]);
}

/**
 * Insert the worker's row into pipeline_processes, or revive an existing row with the
 * same id (status back to 'running', worker mode and heartbeat refreshed).
 *
 * @param object $pdo        Database handle.
 * @param string $processId  Worker identifier.
 * @param string $workerMode Value stored in the worker_mode column.
 * @return bool true when the statement ran, false when it raised an exception.
 */
function mpRegisterProcess($pdo, $processId, $workerMode = 'discovery') {
    $sql = "INSERT INTO pipeline_processes (id, mode, worker_mode, status, stats, last_heartbeat, created_at) VALUES (?, 'pipeline', ?, 'running', '{}', NOW(), NOW()) ON DUPLICATE KEY UPDATE status='running', worker_mode=?, last_heartbeat=NOW()";

    try {
        $upsert = $pdo->prepare($sql);
        // worker mode is bound twice: once for the INSERT, once for the UPDATE branch
        $upsert->execute([$processId, $workerMode, $workerMode]);
    } catch (Exception $e) {
        return false;
    }

    return true;
}

/**
 * Refresh a worker's last_heartbeat and overwrite its stats column with the
 * JSON-encoded $stats. Database failures are swallowed on purpose (best effort).
 *
 * @param object $pdo       Database handle.
 * @param string $processId Worker identifier.
 * @param array  $stats     Counters to publish.
 */
function mpHeartbeat($pdo, $processId, $stats = []) {
    try {
        $update = $pdo->prepare("UPDATE pipeline_processes SET last_heartbeat=NOW(), stats=? WHERE id=?");
        $encodedStats = json_encode($stats);
        $update->execute([$encodedStats, $processId]);
    } catch (Exception $e) {
        // a missed heartbeat is not fatal
    }
}

/**
 * Read the control status stored for a worker.
 *
 * @param object $pdo       Database handle.
 * @param string $processId Worker identifier.
 * @return string The stored status; 'dead' when the row does not exist; 'running' when
 *                the lookup itself fails, so a database hiccup does not stop the worker.
 */
function mpCheckControl($pdo, $processId) {
    try {
        $lookup = $pdo->prepare("SELECT status FROM pipeline_processes WHERE id=? LIMIT 1");
        $lookup->execute([$processId]);
        $record = $lookup->fetch();
    } catch (Exception $e) {
        return 'running';
    }

    if (!$record) {
        return 'dead';
    }
    return $record['status'];
}

/**
 * Apply a control action to a worker by rewriting its status column.
 *
 * @param object $pdo       Database handle.
 * @param string $processId Worker identifier.
 * @param string $action    One of 'pause', 'resume', 'stop'.
 * @return bool false for an unknown action or a database error, true otherwise.
 */
function mpControlProcess($pdo, $processId, $action) {
    $transitions = [
        'pause'  => 'paused',
        'resume' => 'running',
        'stop'   => 'stopped',
    ];
    if (!isset($transitions[$action])) {
        return false;
    }
    $targetStatus = $transitions[$action];

    try {
        $update = $pdo->prepare("UPDATE pipeline_processes SET status=? WHERE id=?");
        $update->execute([$targetStatus, $processId]);
    } catch (Exception $e) {
        return false;
    }
    return true;
}

/**
 * Flag running workers whose heartbeat is older than MP_DEAD_TIMEOUT seconds as 'dead',
 * then hand every URL still claimed by a dead worker (in crawler_pool and crawler_ready)
 * back to the 'pending' state. Database failures are swallowed (best effort).
 *
 * @param object $pdo Database handle.
 */
function mpCleanupDead($pdo) {
    try {
        $pdo->exec("UPDATE pipeline_processes SET status='dead' WHERE status='running' AND last_heartbeat < DATE_SUB(NOW(), INTERVAL " . MP_DEAD_TIMEOUT . " SECOND)");

        $deadWorkers = $pdo->query("SELECT id FROM pipeline_processes WHERE status='dead'")->fetchAll(PDO::FETCH_COLUMN);
        if (empty($deadWorkers)) {
            return;
        }

        // one "?" per dead worker id
        $marks = rtrim(str_repeat('?,', count($deadWorkers)), ',');
        foreach (['crawler_pool', 'crawler_ready'] as $table) {
            $release = $pdo->prepare("UPDATE $table SET status='pending', process_id=NULL, claimed_at=NULL WHERE status='claimed' AND process_id IN ($marks)");
            $release->execute($deadWorkers);
        }
    } catch (Exception $e) {
        // cleanup is retried on the next cycle
    }
}

/**
 * Tell whether another worker may be started: the number of running or paused workers
 * must be below MP_MAX_PROCESSES.
 *
 * @param object $pdo Database handle.
 * @return bool true when below the limit, and also when the count cannot be read.
 */
function mpCanStart($pdo) {
    try {
        $counter = $pdo->query("SELECT COUNT(*) FROM pipeline_processes WHERE status IN ('running','paused')");
        $active = $counter->fetchColumn();
    } catch (Exception $e) {
        return true;
    }

    return $active < MP_MAX_PROCESSES;
}

/**
 * Insert URLs into crawler_pool as 'pending'. URLs are trimmed, anything shorter than
 * 10 characters is dropped, and the priority is clamped to the range 1..100.
 * INSERT IGNORE plus the unique url_hash key skips URLs that are already pooled.
 *
 * @param object   $pdo       Database handle.
 * @param iterable $urls      Candidate URLs.
 * @param string   $processId Worker recorded on the new rows.
 * @param int      $priority  Priority applied to every URL of this call.
 * @return int Number of rows actually inserted (rows inserted before a database error still count).
 */
function mpAddToPool($pdo, $urls, $processId, $priority = 50) {
    if (empty($urls)) return 0;

    $clamped = min(100, max(1, (int)$priority));
    $rows = [];
    foreach ($urls as $candidate) {
        $candidate = trim($candidate);
        // a length below 10 also covers the empty string
        if (strlen($candidate) < 10) continue;
        $rows[] = [$candidate, md5($candidate), $clamped, $processId];
    }
    if (!$rows) return 0;

    $inserted = 0;
    try {
        $insert = $pdo->prepare("INSERT IGNORE INTO crawler_pool (url, url_hash, priority, status, process_id, created_at) VALUES (?, ?, ?, 'pending', ?, NOW())");
        foreach ($rows as $params) {
            $insert->execute($params);
            $inserted += $insert->rowCount();
        }
    } catch (Exception $e) {
        // keep whatever was inserted before the failure
    }
    return $inserted;
}

/**
 * Atomically claim up to $batchSize pending URLs from crawler_pool for one worker.
 *
 * The rows are selected with a locking read (FOR UPDATE) inside the transaction. With a
 * plain SELECT two workers running at the same time could both read the same 'pending'
 * rows and both claim them; with the lock the second worker waits for the first commit
 * and then no longer sees those rows as pending.
 *
 * @param object $pdo       Database handle.
 * @param string $processId Worker that takes ownership of the rows.
 * @param int    $batchSize Maximum number of URLs to claim.
 * @return array List of ['id' => ..., 'url' => ...] rows; empty when nothing is pending
 *               or when the database operation failed.
 */
function mpClaimUrls($pdo, $processId, $batchSize = 20) {
    try {
        $pdo->beginTransaction();
        $stmt = $pdo->prepare("SELECT id, url FROM crawler_pool WHERE status='pending' ORDER BY priority DESC, id ASC LIMIT ? FOR UPDATE");
        $stmt->execute([(int)$batchSize]);
        $rows = $stmt->fetchAll();
        if (!empty($rows)) {
            $ids = array_column($rows, 'id');
            $placeholders = implode(',', array_fill(0, count($ids), '?'));
            // status='pending' is repeated as a safety net: never overwrite a foreign claim
            $upd = $pdo->prepare("UPDATE crawler_pool SET status='claimed', process_id=?, claimed_at=NOW() WHERE status='pending' AND id IN ($placeholders)");
            $upd->execute(array_merge([$processId], $ids));
        }
        $pdo->commit();
        return $rows;
    } catch (Exception $e) {
        // Roll back only when a transaction is really open: if beginTransaction() itself
        // failed (e.g. lost connection), rollBack() would throw out of this handler.
        try {
            if ($pdo->inTransaction()) {
                $pdo->rollBack();
            }
        } catch (Exception $rollbackError) {
            // nothing more can be done; report "nothing claimed"
        }
        return [];
    }
}

/**
 * Set the status of the given crawler_pool rows. Database failures are swallowed.
 *
 * @param object $pdo    Database handle.
 * @param array  $ids    Row ids to update; nothing happens for an empty list.
 * @param string $status New status value.
 */
function mpMarkDone($pdo, $ids, $status = 'done') {
    if (empty($ids)) {
        return;
    }
    try {
        // one "?" per id
        $marks = rtrim(str_repeat('?,', count($ids)), ',');
        $params = array_merge([$status], $ids);
        $update = $pdo->prepare("UPDATE crawler_pool SET status=? WHERE id IN ($marks)");
        $update->execute($params);
    } catch (Exception $e) {
        // best effort
    }
}

/**
 * Heuristic crawl priority for a URL, starting at 50 and clamped to 1..100:
 * +20 for article-like paths, -10 for tag/category/pagination paths,
 * -5 for URLs longer than 200 bytes, -15 for query strings with at least two "&".
 *
 * @param string $url URL to score.
 * @return int Priority between 1 and 100.
 */
function mpUrlPriority($url) {
    $adjustments = [
        '/\/post[s]?\/|\/article|\/blog\/|\/news\/|\d{4}\/\d{2}\/|\.html?$/i' => 20,
        '/\/tag\/|\/category\/|\/page\/\d/i' => -10,
        '/\?.*&.*&/i' => -15,
    ];

    $score = 50;
    foreach ($adjustments as $pattern => $delta) {
        if (preg_match($pattern, $url)) {
            $score += $delta;
        }
    }
    if (strlen($url) > 200) {
        $score -= 5;
    }

    return min(100, max(1, $score));
}

/**
 * Record a URL in crawler_seen, keyed by its md5 hash. An existing entry is left
 * untouched (INSERT IGNORE). Database failures are swallowed.
 *
 * @param object $pdo       Database handle.
 * @param string $url       URL to record.
 * @param string $processId Worker that handled the URL.
 * @param string $status    Status stored with the entry.
 */
function mpMarkSeen($pdo, $url, $processId, $status = 'seen') {
    try {
        $params = [md5($url), $url, $status, $processId];
        $insert = $pdo->prepare("INSERT IGNORE INTO crawler_seen (url_hash, url, status, process_id, seen_at) VALUES (?, ?, ?, ?, NOW())");
        $insert->execute($params);
    } catch (Exception $e) {
        // best effort
    }
}

/**
 * Look up several URLs in crawler_seen with a single query.
 *
 * @param object $pdo  Database handle.
 * @param array  $urls URLs to check.
 * @return array Map url => bool (true when the URL is already recorded). An empty array
 *               is returned for empty input and also when the query fails.
 */
function mpIsSeenBatch($pdo, $urls) {
    if (empty($urls)) {
        return [];
    }

    $hashes = array_map('md5', $urls);
    $marks = rtrim(str_repeat('?,', count($hashes)), ',');

    try {
        $lookup = $pdo->prepare("SELECT url_hash FROM crawler_seen WHERE url_hash IN ($marks)");
        $lookup->execute($hashes);
        // flip so that membership becomes a key lookup
        $known = array_flip($lookup->fetchAll(PDO::FETCH_COLUMN));
    } catch (Exception $e) {
        return [];
    }

    $flags = [];
    foreach ($urls as $candidate) {
        $flags[$candidate] = array_key_exists(md5($candidate), $known);
    }
    return $flags;
}

/**
 * Resolve an author name to a user_myhashtag ID, creating a placeholder account when no
 * user with that username exists. Results are memoised per request in a static cache.
 *
 * SECURITY: auto-created accounts used to share one hard-coded, source-visible password.
 * Each new account now gets a hash of a random throw-away secret, so nobody can log in
 * to these placeholder accounts with a known password.
 *
 * @param object $pdo    Database handle.
 * @param string $author Author name; used verbatim as the username.
 * @return int User ID, or 0 when the lookup/insert failed.
 */
function mpGetOrCreateUser($pdo, $author) {
    static $cache = [];
    if (isset($cache[$author])) return $cache[$author];
    try {
        $stmt = $pdo->prepare("SELECT ID FROM user_myhashtag WHERE username = ? LIMIT 1");
        $stmt->execute([$author]);
        $id = $stmt->fetchColumn();
        if ($id) { $cache[$author] = (int)$id; return (int)$id; }

        // No such user yet: derive the profile fields from the author name.
        $parts = explode(' ', $author);
        $firstName = ucfirst($parts[0]);
        $lastName = ucfirst($parts[1] ?? 'User');
        $email = strtolower(preg_replace('/[^a-zA-Z0-9]/', '', $author)) . '@digupdog.com';
        // Random, never-disclosed secret instead of a shared literal password.
        $password = password_hash(bin2hex(random_bytes(16)), PASSWORD_DEFAULT);
        $birthdate = date('Y-m-d', strtotime('-' . rand(18, 50) . ' years'));
        $stmt = $pdo->prepare("INSERT INTO user_myhashtag (username, email, senha, first_name, last_name, address, phone_number, created_at, birthdate, profile_picture, bio, gender, status, user_role, privacy_settings, social_links, preferences) VALUES (?, ?, ?, ?, ?, '123 Web St', ?, NOW(), ?, 'https://example.com/default.jpg', 'Web explorer', ?, 'active', 'user', '{\"visibility\":\"public\"}', '{\"facebook\":\"\"}', '{\"theme\":\"dark\"}')");
        $stmt->execute([$author, $email, $password, $firstName, $lastName, '+1'.rand(1000000000,9999999999), $birthdate, rand(0,1)?'male':'female']);
        $newId = (int)$pdo->lastInsertId();
        $cache[$author] = $newId;
        return $newId;
    } catch (Exception $e) { return 0; }
}

/**
 * Resolve the host of a URL to its feed_data row, creating the row when no entry's
 * website_base contains that host. Results are memoised per request in a static cache.
 *
 * The host is matched as a substring with LIKE. "%" and "_" inside the host are escaped
 * (explicit ESCAPE '!'), so that e.g. "my_site.example.com" no longer matches
 * "my-site.example.com" through the "_" wildcard.
 *
 * @param object $pdo Database handle.
 * @param string $url URL whose host is looked up.
 * @return array ['domain_id' => int, 'category_id' => int]; both 0 when the URL has no
 *               host or the database operation failed.
 */
function mpGetOrCreateDomain($pdo, $url) {
    static $cache = [];
    $host = parse_url($url, PHP_URL_HOST);
    if (!$host) return ['domain_id'=>0,'category_id'=>0];
    if (isset($cache[$host])) return $cache[$host];
    try {
        // Escape the escape character first, then the two LIKE wildcards.
        $likeHost = str_replace(['!', '%', '_'], ['!!', '!%', '!_'], $host);
        $stmt = $pdo->prepare("SELECT id, main_category_id FROM feed_data WHERE website_base LIKE ? ESCAPE '!' LIMIT 1");
        $stmt->execute(['%'.$likeHost.'%']);
        $row = $stmt->fetch();
        if ($row) { $cache[$host] = ['domain_id'=>(int)$row['id'],'category_id'=>(int)$row['main_category_id']]; return $cache[$host]; }

        // Unknown host: register it with the current URL as feed and no category.
        $stmt = $pdo->prepare("INSERT INTO feed_data (website_base, website_feed, main_category_id) VALUES (?, ?, 0)");
        $stmt->execute(['https://'.$host, $url]);
        $cache[$host] = ['domain_id'=>(int)$pdo->lastInsertId(),'category_id'=>0];
        return $cache[$host];
    } catch (Exception $e) { return ['domain_id'=>0,'category_id'=>0]; }
}

/**
 * Append one crawler_logs row holding the JSON-encoded $entries for a worker.
 * Database failures are swallowed (logging is best effort).
 *
 * @param object $pdo       Database handle.
 * @param string $processId Worker the entries belong to.
 * @param array  $entries   Log entries to store.
 */
function mpWriteLog($pdo, $processId, $entries) {
    try {
        $insert = $pdo->prepare("INSERT INTO crawler_logs (process_id, entries, created_at) VALUES (?, ?, NOW())");
        $payload = json_encode($entries);
        $insert->execute([$processId, $payload]);
    } catch (Exception $e) {
        // logging must never break the worker
    }
}

/**
 * Insert crawled records into pinfeeds, 25 rows per INSERT, all inside one transaction.
 *
 * Records whose 'link' is not a valid URL are skipped. Missing, 'Anonymous' or very short
 * authors are replaced through generateRandomAuthor(); missing tags are derived from the
 * title through extractTagsFromTitle(). String fields are truncated before insertion.
 *
 * @param object $pdo     Database handle.
 * @param array  $records Records with the keys link, title, description, thumbnail,
 *                        author, favicon, tags (all optional except link).
 * @return int Number of rows committed. When anything fails the whole transaction is
 *             rolled back and 0 is returned, so the result never counts rows that were
 *             not actually stored.
 */
function mpImportBatch($pdo, $records) {
    if (empty($records)) return 0;
    $columns = 'title, description, thumbnail, pubDate, link, updated, source_website, author, favicon, tags, embed_code, source_domain, user_id, source_domain_id, main_category_id, title_cat_id, description_cat_id, tag_cat_id';
    $inserted = 0;
    try {
        $pdo->beginTransaction();
        foreach (array_chunk($records, 25) as $chunk) {
            $placeholders = [];
            $values = [];
            foreach ($chunk as $data) {
                $link = $data['link'] ?? '';
                if (!filter_var($link, FILTER_VALIDATE_URL)) continue;
                $host = parse_url($link, PHP_URL_HOST) ?? '';
                $author = $data['author'] ?? '';
                if (empty($author) || $author === 'Anonymous' || mb_strlen($author) < 3) $author = generateRandomAuthor();
                // NOTE(review): both helpers may INSERT inside this transaction and memoise
                // the new IDs in static caches; after a rollback those cached IDs point at
                // rows that no longer exist until the request ends.
                $userId = mpGetOrCreateUser($pdo, $author);
                $domainInfo = mpGetOrCreateDomain($pdo, $link);
                $embedCode = extractEmbedCode($link);
                $tags = $data['tags'] ?? '';
                if (empty($tags) && !empty($data['title'])) $tags = extractTagsFromTitle($data['title']);
                // 13 bound values per row; pubDate/updated use NOW(), the *_cat_id columns are 0
                $placeholders[] = '(?, ?, ?, NOW(), ?, NOW(), ?, ?, ?, ?, ?, ?, ?, ?, ?, 0, 0, 0)';
                $values = array_merge($values, [
                    mb_substr($data['title'] ?? 'No title', 0, 255),
                    mb_substr($data['description'] ?? '', 0, 1000),
                    mb_substr($data['thumbnail'] ?? '', 0, 500),
                    $link,
                    mb_substr($host, 0, 255),
                    mb_substr($author, 0, 255),
                    mb_substr($data['favicon'] ?? '', 0, 255),
                    mb_substr($tags, 0, 500),
                    mb_substr($embedCode, 0, 2000),
                    mb_substr($host, 0, 255),
                    $userId,
                    $domainInfo['domain_id'],
                    $domainInfo['category_id']
                ]);
            }
            if (!empty($placeholders)) {
                $sql = "INSERT INTO pinfeeds ($columns) VALUES " . implode(', ', $placeholders);
                $stmt = $pdo->prepare($sql);
                $stmt->execute($values);
                $inserted += $stmt->rowCount();
            }
        }
        $pdo->commit();
    } catch (Exception $e) {
        // Roll back only when a transaction is really open (beginTransaction() itself may
        // have been the call that failed), and never let the rollback escape this handler.
        try {
            if ($pdo->inTransaction()) {
                $pdo->rollBack();
            }
        } catch (Exception $rollbackError) {
            // nothing more can be done here
        }
        // Rows counted from earlier chunks were rolled back together with the failing one.
        $inserted = 0;
    }
    return $inserted;
}

// ============================================
// BLOCK 3: READY QUEUE FUNCTIONS
// ============================================

/**
 * Queue URLs (without HTML) in crawler_ready as 'pending'. URLs are trimmed, anything
 * shorter than 10 characters is dropped, and each URL gets its own priority from
 * calculateUrlPriority(). INSERT IGNORE skips URLs whose hash is already queued.
 *
 * @param object   $pdo       Database handle.
 * @param iterable $urls      Candidate URLs.
 * @param string   $processId Accepted for signature symmetry; not stored by this function.
 * @param array    $config    Optional 'source_keyword' and 'source_engine' (stored,
 *                            truncated to 500/50 chars); also passed on to the scorer.
 * @return int Number of rows actually inserted (rows inserted before a database error still count).
 */
function mpAddToReady($pdo, $urls, $processId, $config = []) {
    if (empty($urls)) return 0;

    $keyword = $config['source_keyword'] ?? '';
    $engine = $config['source_engine'] ?? '';
    $queued = 0;

    try {
        $insert = $pdo->prepare("INSERT IGNORE INTO crawler_ready (url, url_hash, source_keyword, source_engine, priority, status, created_at) VALUES (?, ?, ?, ?, ?, 'pending', NOW())");
        foreach ($urls as $candidate) {
            $candidate = trim($candidate);
            // a length below 10 also covers the empty string
            if (strlen($candidate) < 10) continue;

            $digest = md5($candidate);
            $score = calculateUrlPriority($candidate, $config);
            $insert->execute([
                $candidate,
                $digest,
                mb_substr($keyword, 0, 500),
                mb_substr($engine, 0, 50),
                $score,
            ]);
            $queued += $insert->rowCount();
        }
    } catch (Exception $e) {
        // keep whatever was queued before the failure
    }
    return $queued;
}

/**
 * Queue already-fetched pages in crawler_ready as 'pending', storing the HTML and HTTP
 * code next to the URL. Items without a URL are skipped; INSERT IGNORE skips URLs whose
 * hash is already queued.
 *
 * @param object   $pdo       Database handle.
 * @param iterable $urlData   Items with the keys url, html, http_code, keyword, engine.
 * @param string   $processId Accepted for signature symmetry; not stored by this function.
 * @return int Number of rows actually inserted (rows inserted before a database error still count).
 */
function mpAddToReadyWithHtml($pdo, $urlData, $processId) {
    if (empty($urlData)) return 0;

    $stored = 0;
    try {
        $insert = $pdo->prepare("INSERT IGNORE INTO crawler_ready (url, url_hash, html, http_code, source_keyword, source_engine, priority, status, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, 'pending', NOW())");
        foreach ($urlData as $entry) {
            $target = $entry['url'] ?? '';
            if (empty($target)) continue;

            $params = [
                $target,
                md5($target),
                $entry['html'] ?? null,
                $entry['http_code'] ?? 0,
                mb_substr($entry['keyword'] ?? '', 0, 500),
                mb_substr($entry['engine'] ?? '', 0, 50),
                calculateUrlPriority($target, []),
            ];
            $insert->execute($params);
            $stored += $insert->rowCount();
        }
    } catch (Exception $e) {
        // keep whatever was queued before the failure
    }
    return $stored;
}

/**
 * Atomically claim up to $batchSize pending rows from crawler_ready for one worker.
 *
 * The rows are selected with a locking read (FOR UPDATE) inside the transaction. With a
 * plain SELECT two workers running at the same time could both read the same 'pending'
 * rows and both process them; with the lock the second worker waits for the first commit
 * and then no longer sees those rows as pending.
 *
 * @param object $pdo       Database handle.
 * @param string $processId Worker that takes ownership of the rows.
 * @param int    $batchSize Maximum number of rows to claim.
 * @return array Rows with id, url, html, http_code, source_keyword, source_engine;
 *               empty when nothing is pending or when the database operation failed.
 */
function mpClaimReady($pdo, $processId, $batchSize = 20) {
    try {
        $pdo->beginTransaction();
        $stmt = $pdo->prepare("SELECT id, url, html, http_code, source_keyword, source_engine FROM crawler_ready WHERE status='pending' ORDER BY priority DESC, id ASC LIMIT ? FOR UPDATE");
        $stmt->execute([(int)$batchSize]);
        $rows = $stmt->fetchAll();
        if (!empty($rows)) {
            $ids = array_column($rows, 'id');
            $placeholders = implode(',', array_fill(0, count($ids), '?'));
            // status='pending' is repeated as a safety net: never overwrite a foreign claim
            $upd = $pdo->prepare("UPDATE crawler_ready SET status='claimed', process_id=?, claimed_at=NOW() WHERE status='pending' AND id IN ($placeholders)");
            $upd->execute(array_merge([$processId], $ids));
        }
        $pdo->commit();
        return $rows;
    } catch (Exception $e) {
        // Roll back only when a transaction is really open: if beginTransaction() itself
        // failed (e.g. lost connection), rollBack() would throw out of this handler.
        try {
            if ($pdo->inTransaction()) {
                $pdo->rollBack();
            }
        } catch (Exception $rollbackError) {
            // nothing more can be done; report "nothing claimed"
        }
        return [];
    }
}

/**
 * Set the status of the given crawler_ready rows. Database failures are swallowed.
 *
 * @param object $pdo    Database handle.
 * @param array  $ids    Row ids to update; nothing happens for an empty list.
 * @param string $status New status value.
 */
function mpMarkReadyDone($pdo, $ids, $status = 'done') {
    if (empty($ids)) {
        return;
    }
    try {
        // one "?" per id
        $marks = rtrim(str_repeat('?,', count($ids)), ',');
        $params = array_merge([$status], $ids);
        $update = $pdo->prepare("UPDATE crawler_ready SET status=? WHERE id IN ($marks)");
        $update->execute($params);
    } catch (Exception $e) {
        // best effort
    }
}

/**
 * Count the 'pending' rows in crawler_ready.
 *
 * @param object $pdo Database handle.
 * @return int Number of pending rows; 0 when the query fails.
 */
function mpGetQueueDepth($pdo) {
    try {
        $counter = $pdo->query("SELECT COUNT(*) FROM crawler_ready WHERE status='pending'");
        $pending = $counter->fetchColumn();
    } catch (Exception $e) {
        return 0;
    }
    return (int)$pending;
}

/**
 * Housekeeping for crawler_ready: delete finished ('done'/'failed') rows older than
 * $maxAgeDays and put claims older than 10 minutes back to 'pending'.
 * Database failures are swallowed (best effort).
 *
 * $maxAgeDays ends up inside the SQL text, so it is reduced to a non-negative integer
 * first; anything that is not numeric falls back to the default of 7 days instead of
 * being interpolated into the statement.
 *
 * @param object $pdo        Database handle.
 * @param int    $maxAgeDays Age in days after which finished rows are deleted.
 */
function mpCleanupReady($pdo, $maxAgeDays = 7) {
    $days = is_numeric($maxAgeDays) ? max(0, (int)$maxAgeDays) : 7;
    try {
        $pdo->exec("DELETE FROM crawler_ready WHERE status IN ('done','failed') AND created_at < DATE_SUB(NOW(), INTERVAL $days DAY)");
        // Release stale claims (>10 min)
        $pdo->exec("UPDATE crawler_ready SET status='pending', process_id=NULL, claimed_at=NULL WHERE status='claimed' AND claimed_at < DATE_SUB(NOW(), INTERVAL 10 MINUTE)");
    } catch (Exception $e) {}
}

/**
 * Heuristic priority for a URL in the ready queue, starting at 50 and clamped to 1..100.
 *
 *   +20  article-like path (post, article, blog, news, story, yyyy/mm/, .htm/.html)
 *   +15  URL matches the caller's $config['url_pattern'] (regex body, case-insensitive)
 *   -10  tag / category / pagination / search path
 *    -5  URL longer than 200 bytes
 *   -15  query string with at least three "&"
 *
 * The user pattern is wrapped in "/" delimiters. Unescaped "/" inside the pattern are
 * escaped first; previously a pattern such as "/blog/\d+" terminated the regex early,
 * failed to compile and silently never matched. A pattern that is still invalid gives
 * no boost. (An escaped backslash directly followed by "/" is not handled.)
 *
 * @param string $url    URL to score.
 * @param array  $config Optional 'url_pattern'.
 * @return int Priority between 1 and 100.
 */
function calculateUrlPriority($url, $config) {
    $priority = 50;
    // Boost article-like URLs
    if (preg_match('/\/post[s]?\/|\/article|\/blog\/|\/news\/|\/story\/|\d{4}\/\d{2}\/|\.html?$/i', $url)) $priority += 20;
    // Boost if matches URL pattern filter
    if (!empty($config['url_pattern']) && is_string($config['url_pattern'])) {
        // escape every "/" that is not already preceded by a backslash
        $body = preg_replace('#(?<!\\\\)/#', '\\\\/', $config['url_pattern']);
        // "@": the pattern is user input and may be invalid; that simply means "no boost"
        if (is_string($body) && @preg_match('/' . $body . '/i', $url) === 1) $priority += 15;
    }
    // Penalize pagination/tags
    if (preg_match('/\/tag\/|\/category\/|\/page\/\d|\/search\?/i', $url)) $priority -= 10;
    // Penalize very long URLs
    if (strlen($url) > 200) $priority -= 5;
    // Penalize many query params
    if (preg_match('/\?.*&.*&.*&/i', $url)) $priority -= 15;
    return min(100, max(1, $priority));
}

// ============================================
// BLOCK 4: DISCOVERY WORKER
// ============================================

/**
 * Discovery worker (producer side of the pipeline).
 *
 * 1. Seed phase: keywords that are URLs go straight into crawler_pool; every other
 *    keyword is sent to each enabled search engine and the results are pooled.
 * 2. Main loop, until 'max_discovered' is reached or the worker is stopped: claim a wave
 *    of URLs from crawler_pool, fetch them, extract links, drop skipped/failing/filtered
 *    links, and push the unseen ones into crawler_ready (optionally pre-fetching their
 *    HTML) and back into crawler_pool. When the pool is empty it is re-seeded with a
 *    keyword variation. While crawler_ready holds more than 'max_queue_depth' pending
 *    rows the worker sleeps instead of producing (backpressure).
 *
 * Config keys read here: wave_size, max_discovered, max_queue_depth, links_per_page
 * (capped at 250), pre_fetch, include_terms, exclude_terms, url_pattern, fetch_delay (ms),
 * engine_<name> flags. $config is also passed through to callSearchEngine(),
 * fetchBatchWithCurl() and extractAllLinks().
 *
 * Fixes compared to the previous version:
 *  - url_pattern: "/" inside the pattern is escaped and the pattern is validated once.
 *    Before, a pattern that failed to compile made every extracted link fail the filter,
 *    so the worker silently queued nothing. An invalid pattern is now logged and ignored.
 *  - An empty keyword list no longer crashes the re-seed step (array_rand on []).
 *  - The pool priority is taken from the first unseen link (array_filter keeps keys,
 *    so index 0 was often missing).
 *  - All extracted links are checked against crawler_seen, not only the first 200.
 *  - A paused worker whose row disappears ('dead') now exits instead of resuming.
 *  - Re-seed queries no longer contain a double space.
 *  - Unused locals (forced_domains, follow_external, posts_first, enable_pagination,
 *    max_depth) were removed; those config keys were never used by this function.
 *
 * NOTE(review): on exit only stats.status is set to 'finished'; the row's status column
 * is left untouched by this function.
 *
 * @param string $processId Worker identifier.
 * @param array  $keywords  Search keywords and/or seed URLs.
 * @param array  $config    Worker configuration (see above).
 * @param object $pdo       Database handle.
 */
function runDiscoveryWorker($processId, $keywords, $config, $pdo) {
    ignore_user_abort(true);
    set_time_limit(0);
    ini_set('memory_limit', '512M');

    if (!mpRegisterProcess($pdo, $processId, 'discovery')) return;

    // Normalise the keyword list once: trimmed, no blank entries, list-indexed.
    $keywords = is_array($keywords)
        ? array_values(array_filter(array_map('trim', $keywords), 'strlen'))
        : [];

    $waveSize = $config['wave_size'] ?? 50;
    $maxDiscovered = $config['max_discovered'] ?? 100000;
    $maxQueueDepth = $config['max_queue_depth'] ?? 5000;
    $linksPerPage = min($config['links_per_page'] ?? 150, 250);
    $preFetchHtml = !empty($config['pre_fetch']);
    $includeTerms = $config['include_terms'] ?? [];
    $excludeTerms = $config['exclude_terms'] ?? [];
    $urlPattern = $config['url_pattern'] ?? '';
    $fetchDelay = $config['fetch_delay'] ?? 0;

    // Compile the optional URL filter once. Unescaped "/" are escaped because "/" is the
    // regex delimiter; a pattern that still does not compile disables the filter.
    $urlPatternRegex = '';
    if (!empty($urlPattern) && is_string($urlPattern)) {
        $candidateRegex = '/' . preg_replace('#(?<!\\\\)/#', '\\\\/', $urlPattern) . '/i';
        if (@preg_match($candidateRegex, '') !== false) {
            $urlPatternRegex = $candidateRegex;
        } else {
            mpWriteLog($pdo, $processId, [['url'=>'','status'=>'error','title'=>"Invalid url_pattern ignored: $urlPattern",'domain'=>'SYSTEM']]);
        }
    }

    // Enabled search engines
    $enabledEngines = [];
    $engineList = ['searxng','google','bing','bing_regional','duckduckgo','yahoo','yandex','brave','wikipedia','baidu','direct'];
    foreach ($engineList as $eng) {
        if (!empty($config['engine_' . $eng])) $enabledEngines[] = $eng;
    }
    if (empty($enabledEngines)) $enabledEngines = ['searxng','google','bing','duckduckgo'];

    $stats = ['discovered'=>0,'queued'=>0,'queue_depth'=>0,'waves'=>0,'rate'=>0,'status'=>'running','engines_used'=>implode(',', $enabledEngines)];
    $domainFailCount = [];
    $MAX_DOMAIN_FAILS = 4;
    $lastHeartbeat = time();
    $lastCleanup = time();
    $startTime = time();

    // SEED PHASE
    $seedCount = 0;
    foreach ($keywords as $kw) {
        $kw = trim($kw);
        if (empty($kw)) continue;
        // A keyword that is itself a URL is pooled directly with high priority.
        if (preg_match('/^https?:\/\//i', $kw)) {
            mpAddToPool($pdo, [$kw], $processId, 90);
            $seedCount++;
            continue;
        }
        foreach ($enabledEngines as $engine) {
            $links = callSearchEngine($engine, $kw, 20, $config);
            $validLinks = [];
            foreach ($links as $link) {
                $host = parse_url($link, PHP_URL_HOST) ?? '';
                if (!isDomainSkipped($host)) $validLinks[] = $link;
            }
            if (!empty($validLinks)) {
                $seedCount += mpAddToPool($pdo, $validLinks, $processId, mpUrlPriority($validLinks[0]));
            }
            if ($fetchDelay > 0) usleep($fetchDelay * 1000);
        }
    }
    $stats['discovered'] = $seedCount;
    mpHeartbeat($pdo, $processId, $stats);
    mpWriteLog($pdo, $processId, [['url'=>'','status'=>'imported','title'=>"Discovery seeded: $seedCount URLs from ".count($enabledEngines)." engines",'domain'=>'SYSTEM']]);

    // MAIN DISCOVERY LOOP
    $reseedIndex = 0;
    while ($stats['discovered'] < $maxDiscovered) {
        // Control channel: stop/dead ends the worker, paused blocks until resumed.
        $status = mpCheckControl($pdo, $processId);
        if ($status === 'stopped' || $status === 'dead') break;
        while ($status === 'paused') { sleep(2); $status = mpCheckControl($pdo, $processId); if ($status === 'stopped' || $status === 'dead') break 2; }

        if (time() - $lastHeartbeat >= MP_HEARTBEAT_SEC) {
            $elapsed = max(1, time() - $startTime);
            $stats['rate'] = round($stats['discovered'] / $elapsed, 2);
            $stats['queue_depth'] = mpGetQueueDepth($pdo);
            mpHeartbeat($pdo, $processId, $stats);
            $lastHeartbeat = time();
        }
        // Every 5 minutes: reap dead workers and tidy the ready queue.
        if (time() - $lastCleanup >= 300) { mpCleanupDead($pdo); mpCleanupReady($pdo); $lastCleanup = time(); }

        // BACKPRESSURE
        $queueDepth = mpGetQueueDepth($pdo);
        $stats['queue_depth'] = $queueDepth;
        if ($queueDepth > $maxQueueDepth) {
            $stats['status'] = 'backpressure';
            mpHeartbeat($pdo, $processId, $stats);
            sleep(5);
            continue;
        }
        $stats['status'] = 'running';

        // Shrink the wave when the queue is nearly full, grow it when nearly empty.
        $effectiveWave = $waveSize;
        if ($queueDepth > $maxQueueDepth * 0.8) $effectiveWave = max(5, (int)($waveSize / 4));
        elseif ($queueDepth < 100) $effectiveWave = min(100, $waveSize * 2);

        $claimed = mpClaimUrls($pdo, $processId, $effectiveWave);
        if (empty($claimed)) {
            // Pool is empty: re-seed from a random keyword, rotating suffixes and engines.
            $reseedIndex++;
            if (!empty($keywords)) {
                $kw = $keywords[array_rand($keywords)];
                if (!preg_match('/^https?:\/\//i', $kw)) {
                    $variations = ['', ' news', ' blog', ' article', ' latest', ' best', ' top', ' guide'];
                    $variedKw = trim($kw . $variations[$reseedIndex % count($variations)]);
                    $engine = $enabledEngines[$reseedIndex % count($enabledEngines)];
                    $newSeeds = callSearchEngine($engine, $variedKw, 20, $config);
                    $validSeeds = array_filter($newSeeds, function($l) { return !isDomainSkipped(parse_url($l, PHP_URL_HOST) ?? ''); });
                    if (!empty($validSeeds)) { $stats['discovered'] += mpAddToPool($pdo, $validSeeds, $processId, 50); }
                }
            }
            usleep(500000);
            continue;
        }

        $wave = array_column($claimed, 'url');
        $waveIds = array_column($claimed, 'id');
        $stats['waves']++;
        $results = fetchBatchWithCurl($wave, 4, count($wave), $config);

        foreach ($results as $url => $result) {
            $fetchDomain = parse_url($url, PHP_URL_HOST) ?? '';
            // Failed fetch: remember the failure for this domain and move on.
            if ($result['http_code'] < 200 || $result['http_code'] >= 400 || empty($result['html'])) {
                $domainFailCount[$fetchDomain] = ($domainFailCount[$fetchDomain] ?? 0) + 1;
                mpMarkSeen($pdo, $url, $processId, 'error');
                continue;
            }
            // A success slowly forgives earlier failures of the same domain.
            $domainFailCount[$fetchDomain] = max(0, ($domainFailCount[$fetchDomain] ?? 0) - 1);
            $html = $result['html'];
            $isSerpPage = preg_match('/google\.|bing\.|yahoo\.|yandex\.|duckduckgo\.|baidu\.|ecosia\.|brave\.|searx/i', $fetchDomain);

            if ($isSerpPage) {
                // Search result pages get the dedicated extractor, with the generic one as fallback.
                $pageLinks = extractSearchResultsFromUrl($url, $html);
                if (empty($pageLinks)) $pageLinks = extractAllLinks($html, $url, $linksPerPage, $includeTerms, $excludeTerms, $config);
            } else {
                $pageLinks = extractAllLinks($html, $url, $linksPerPage, $includeTerms, $excludeTerms, $config);
            }

            $newLinks = [];
            foreach ($pageLinks as $pl) {
                $plHost = parse_url($pl, PHP_URL_HOST) ?? '';
                if (isDomainSkipped($plHost)) continue;
                if (($domainFailCount[$plHost] ?? 0) >= $MAX_DOMAIN_FAILS) continue;
                if (preg_match('/\/wp-content\/|\/wp-includes\/|\/wp-admin\/|\/cdn-cgi\/|\/feed\/?$|\/xmlrpc|\/wp-json|^javascript:|^mailto:/i', $pl)) continue;
                if ($urlPatternRegex !== '' && !preg_match($urlPatternRegex, $pl)) continue;
                $newLinks[] = $pl;
            }

            if (!empty($newLinks)) {
                // Check every link against crawler_seen, 200 hashes per query.
                $seenCheck = [];
                foreach (array_chunk($newLinks, 200) as $seenChunk) {
                    $seenCheck += mpIsSeenBatch($pdo, $seenChunk);
                }
                // array_values: array_filter keeps the original keys, later code needs a list.
                $unseenLinks = array_values(array_filter($newLinks, function($l) use ($seenCheck) { return !($seenCheck[$l] ?? false); }));
                if (!empty($unseenLinks)) {
                    if ($preFetchHtml) {
                        // Fetch up to 30 pages now so processors can skip the download.
                        $preFetchBatch = array_slice($unseenLinks, 0, 30);
                        $preFetchResults = fetchBatchWithCurl($preFetchBatch, 3, count($preFetchBatch), $config);
                        $readyData = [];
                        foreach ($preFetchResults as $pUrl => $pResult) {
                            if ($pResult['http_code'] >= 200 && $pResult['http_code'] < 400 && !empty($pResult['html'])) {
                                $readyData[] = ['url'=>$pUrl,'html'=>$pResult['html'],'http_code'=>$pResult['http_code'],'keyword'=>$keywords[0]??'','engine'=>'discovery'];
                            }
                        }
                        $queued = mpAddToReadyWithHtml($pdo, $readyData, $processId);
                    } else {
                        $readyConfig = ['source_keyword'=>$keywords[0]??'','source_engine'=>'discovery','url_pattern'=>$urlPattern];
                        $queued = mpAddToReady($pdo, array_slice($unseenLinks, 0, 200), $processId, $readyConfig);
                    }
                    $stats['queued'] += $queued;
                    $stats['discovered'] += count($unseenLinks);
                    // Feed a slice back into the pool so the crawl keeps expanding.
                    mpAddToPool($pdo, array_slice($unseenLinks, 0, 50), $processId, mpUrlPriority($unseenLinks[0]));
                }
            }
            mpMarkSeen($pdo, $url, $processId, 'crawled');
            if ($fetchDelay > 0) usleep($fetchDelay * 1000);
        }
        mpMarkDone($pdo, $waveIds, 'done');
    }

    $stats['status'] = 'finished';
    mpHeartbeat($pdo, $processId, $stats);
    mpWriteLog($pdo, $processId, [['url'=>'','status'=>'imported','title'=>"Discovery finished: {$stats['discovered']} discovered, {$stats['queued']} queued",'domain'=>'SYSTEM']]);
}

// ============================================
// BLOCK 5: PROCESSOR WORKER
// ============================================

/**
 * Processor worker main loop.
 *
 * Repeatedly claims a batch of rows from the ready queue (via mpClaimReady), fetches the
 * HTML for rows that were queued without it, runs each page through the filter chain
 * (HTTP status, robots noindex, include terms / forced domains, canonical already seen,
 * quality score, relevance score, already present in pinfeeds) and hands the survivors to
 * mpImportBatch in groups of 25.
 *
 * The loop ends when the control status becomes 'stopped' or 'dead', when the import
 * counter reaches max_imports (checked between batches, so the last batch may overshoot),
 * or when the queue has been empty for 10 consecutive polls while no discovery worker is
 * reported as running or paused.
 *
 * Note: $stats['processed'] and the average processing time only count pages that reached
 * the import queue; rejected pages are counted under errors/skipped/duplicates instead.
 *
 * @param string $processId Identifier this worker registers and reports under.
 * @param array  $keywords  Keywords passed to calculateRelevanceScore().
 * @param array  $config    Optional keys read here: batch_size, quality_threshold,
 *                          relevance_threshold, max_imports, include_terms,
 *                          forced_domains, fetch_delay (milliseconds). The whole array is
 *                          also forwarded to fetchWithCurl().
 * @param PDO    $pdo       Open database connection.
 * @return void
 */
function runProcessorWorker($processId, $keywords, $config, $pdo) {
    ignore_user_abort(true);
    set_time_limit(0);
    ini_set('memory_limit', '512M');

    if (!mpRegisterProcess($pdo, $processId, 'processor')) return;

    $batchSize = $config['batch_size'] ?? 20;
    $qualityThreshold = $config['quality_threshold'] ?? 20;
    $relevanceThreshold = $config['relevance_threshold'] ?? 2;
    $maxImports = $config['max_imports'] ?? 1000000;
    $includeTerms = $config['include_terms'] ?? [];
    $forcedDomains = $config['forced_domains'] ?? [];
    $fetchDelay = $config['fetch_delay'] ?? 0;

    $stats = ['processed'=>0,'imported'=>0,'duplicates'=>0,'errors'=>0,'skipped'=>0,'queue_depth'=>0,'avg_ms'=>0,'status'=>'running'];
    $lastHeartbeat = time();
    $lastCleanup = time();
    $totalProcessTime = 0;
    $emptyQueueCount = 0;

    // Duplicate-check statement: prepared lazily once and reused for every URL instead of
    // being re-prepared per item. Reset to null on failure so the next item retries.
    $dupStmt = null;

    // Imports a group of rows and logs the first five of them; returns the imported count.
    $flushImports = function ($rows) use ($pdo, $processId) {
        $imported = mpImportBatch($pdo, $rows);
        $preview = array_map(function ($r) {
            return ['url'=>$r['link'],'status'=>'imported','title'=>$r['title'],'domain'=>parse_url($r['link'],PHP_URL_HOST)??''];
        }, array_slice($rows, 0, 5));
        mpWriteLog($pdo, $processId, $preview);
        return $imported;
    };

    while ($stats['imported'] < $maxImports) {
        $status = mpCheckControl($pdo, $processId);
        if ($status === 'stopped' || $status === 'dead') break;
        while ($status === 'paused') {
            sleep(2);
            $status = mpCheckControl($pdo, $processId);
            // Leave immediately on either terminal state; previously 'dead' fell through
            // and one more batch was processed before the outer check caught it.
            if ($status === 'stopped' || $status === 'dead') break 2;
        }

        if (time() - $lastHeartbeat >= MP_HEARTBEAT_SEC) {
            $stats['queue_depth'] = mpGetQueueDepth($pdo);
            if ($stats['processed'] > 0) $stats['avg_ms'] = round($totalProcessTime / $stats['processed']);
            mpHeartbeat($pdo, $processId, $stats);
            $lastHeartbeat = time();
        }
        if (time() - $lastCleanup >= 300) { mpCleanupDead($pdo); mpCleanupReady($pdo); $lastCleanup = time(); }

        // AUTO-SCALE batch: larger claims for a deep queue, smaller ones when it is nearly drained.
        $queueDepth = mpGetQueueDepth($pdo);
        $stats['queue_depth'] = $queueDepth;
        $effectiveBatch = $batchSize;
        if ($queueDepth > 2000) $effectiveBatch = min(50, (int)($batchSize * 1.5));
        elseif ($queueDepth < 200 && $queueDepth > 0) $effectiveBatch = max(5, (int)($batchSize * 0.7));

        $claimed = mpClaimReady($pdo, $processId, $effectiveBatch);
        if (empty($claimed)) {
            $emptyQueueCount++;
            try { $activeDiscovery = (int)$pdo->query("SELECT COUNT(*) FROM pipeline_processes WHERE worker_mode='discovery' AND status IN ('running','paused')")->fetchColumn(); } catch(Exception $e) { $activeDiscovery = 0; }
            // Nothing left to do once no producer is alive and the queue stayed empty for 10 polls.
            if ($activeDiscovery == 0 && $emptyQueueCount >= 10) break;
            sleep(2);
            continue;
        }
        $emptyQueueCount = 0;

        $importQueue = [];
        $doneIds = [];
        $failedIds = [];

        foreach ($claimed as $item) {
            $processStart = microtime(true);
            $url = $item['url'];
            $html = $item['html'] ?? '';
            $httpCode = $item['http_code'] ?? 0;

            // Row was queued without pre-fetched HTML: fetch it now.
            if (empty($html)) {
                $fetchResult = fetchWithCurl($url, $config);
                $html = $fetchResult['html'] ?? '';
                $httpCode = $fetchResult['http_code'] ?? 0;
                if ($fetchDelay > 0) usleep($fetchDelay * 1000);
            }

            if ($httpCode < 200 || $httpCode >= 400 || empty($html)) {
                $stats['errors']++;
                $failedIds[] = $item['id'];
                mpMarkSeen($pdo, $url, $processId, 'error');
                continue;
            }

            $noindex = hasNoindexNofollow($html);
            if ($noindex['noindex']) {
                $stats['skipped']++;
                $doneIds[] = $item['id'];
                mpMarkSeen($pdo, $url, $processId, 'noindex');
                continue;
            }

            $meta = extractMetadata($url, $html);

            // Include filter: the page must mention at least one include term in its URL,
            // title, description or first 3000 characters of body text - unless its host
            // matches one of the forced domains.
            if (!empty($includeTerms)) {
                $passesInclude = false;
                $cleanBody = preg_replace('/<(script|style|noscript|nav|footer|header)[^>]*>.*?<\/\1>/si', '', $html);
                $bodyText = mb_substr(trim(preg_replace('/\s+/', ' ', strip_tags($cleanBody))), 0, 3000);
                $searchable = strtolower($url . ' ' . ($meta['title'] ?? '') . ' ' . ($meta['description'] ?? '') . ' ' . $bodyText);
                foreach ($includeTerms as $term) {
                    if (!empty($term) && stripos($searchable, $term) !== false) { $passesInclude = true; break; }
                }
                if (!$passesInclude) {
                    $urlHost = parse_url($url, PHP_URL_HOST) ?? '';
                    $isForcedDomain = false;
                    foreach ($forcedDomains as $fd) {
                        if (!empty($fd) && stripos($urlHost, $fd) !== false) { $isForcedDomain = true; break; }
                    }
                    if (!$isForcedDomain) { $stats['skipped']++; $doneIds[] = $item['id']; continue; }
                }
            }

            // A page whose canonical URL was already seen counts as a duplicate.
            $canonical = getCanonicalUrl($html, $url);
            if ($canonical && $canonical !== $url) {
                $canonSeen = mpIsSeenBatch($pdo, [$canonical]);
                if (!empty($canonSeen[$canonical])) { $stats['duplicates']++; $doneIds[] = $item['id']; continue; }
            }

            $qualityScore = scoreContentQuality($url, $html, $meta);
            if ($qualityScore < $qualityThreshold) {
                $stats['skipped']++;
                $doneIds[] = $item['id'];
                mpMarkSeen($pdo, $url, $processId, 'low_quality');
                continue;
            }

            // The relevance filter is bypassed entirely whenever any forced domain is configured.
            $relevanceScore = calculateRelevanceScore($url, $meta, $keywords);
            if ($relevanceScore < $relevanceThreshold && empty($forcedDomains)) {
                $stats['skipped']++;
                $doneIds[] = $item['id'];
                mpMarkSeen($pdo, $url, $processId, 'low_relevance');
                continue;
            }

            // Best-effort duplicate check against already imported rows; a database error
            // here is deliberately not fatal and the page proceeds to import.
            try {
                if ($dupStmt === null) $dupStmt = $pdo->prepare("SELECT COUNT(*) FROM pinfeeds WHERE link = ? LIMIT 1");
                $dupStmt->execute([$url]);
                $alreadyImported = $dupStmt->fetchColumn() > 0;
                $dupStmt->closeCursor();
                if ($alreadyImported) {
                    $stats['duplicates']++;
                    $doneIds[] = $item['id'];
                    mpMarkSeen($pdo, $url, $processId, 'duplicate');
                    continue;
                }
            } catch (Exception $e) {
                $dupStmt = null;
            }

            $importQueue[] = ['title'=>$meta['title']??'No title','description'=>$meta['description']??'','thumbnail'=>$meta['thumbnail']??'','link'=>$url,'author'=>$meta['author']??'','favicon'=>$meta['favicon']??'','tags'=>$meta['tags']??''];
            $doneIds[] = $item['id'];
            mpMarkSeen($pdo, $url, $processId, 'imported');
            $totalProcessTime += (int)((microtime(true) - $processStart) * 1000);
            $stats['processed']++;

            if (count($importQueue) >= 25) {
                $stats['imported'] += $flushImports($importQueue);
                $importQueue = [];
            }
        }

        if (!empty($importQueue)) {
            $stats['imported'] += $flushImports($importQueue);
        }
        mpMarkReadyDone($pdo, $doneIds, 'done');
        mpMarkReadyDone($pdo, $failedIds, 'failed');
    }

    $stats['status'] = 'finished';
    if ($stats['processed'] > 0) $stats['avg_ms'] = round($totalProcessTime / $stats['processed']);
    mpHeartbeat($pdo, $processId, $stats);
    mpWriteLog($pdo, $processId, [['url'=>'','status'=>'imported','title'=>"Processor finished: {$stats['imported']} imported, {$stats['processed']} processed",'domain'=>'SYSTEM']]);
}

/**
 * Routes a search request to the scraper registered for the given engine name.
 *
 * @param mixed  $engine Engine identifier (e.g. 'google', 'bing', 'duckduckgo').
 * @param string $query  Search query.
 * @param int    $limit  Maximum number of links requested from the scraper.
 * @param array  $config Accepted for signature compatibility; not used here.
 * @return array Links returned by the scraper, or an empty array for an unknown engine.
 */
function callSearchEngine($engine, $query, $limit, $config = []) {
    $scrapers = [
        'searxng'       => 'getSearXNGLinks',
        'google'        => 'getGoogleSearchLinks',
        'bing'          => 'getBingSearchLinks',
        'bing_regional' => 'getBingRegionalLinks',
        'duckduckgo'    => 'getDuckDuckGoLinks',
        'yahoo'         => 'getYahooSearchLinks',
        'yandex'        => 'getYandexSearchLinks',
        'brave'         => 'getBraveSearchLinks',
        'wikipedia'     => 'getWikipediaLinks',
        'baidu'         => 'getBaiduSearchLinks',
        'direct'        => 'getDirectSearchLinks',
    ];

    // Walk the table in declaration order with a loose comparison so the matching rules
    // stay exactly those of a switch statement over the same names.
    foreach ($scrapers as $name => $scraper) {
        if ($engine == $name) {
            return $scraper($query, $limit);
        }
    }
    return [];
}

// ============================================
// BLOCO 6: SEARCH ENGINE FUNCTIONS
// ============================================

/**
 * Returns the pool of browser User-Agent strings that the fetchers pick from at random.
 *
 * @return array Zero-indexed list of nine User-Agent strings.
 */
function getRotatingUserAgents() {
    $pool = [];

    // Chrome 120 on Windows, macOS and Linux
    $pool[] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
    $pool[] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
    $pool[] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

    // Firefox 121 on Windows, Safari 17.2 on macOS, Edge 119 on Windows
    $pool[] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0';
    $pool[] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15';
    $pool[] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0';

    // Firefox 121 on Linux, Chrome 118 on Windows, mobile Safari on iPhone
    $pool[] = 'Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0';
    $pool[] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36';
    $pool[] = 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Mobile/15E148 Safari/604.1';

    return $pool;
}

/**
 * Tells whether a host belongs to the built-in list of domains the crawler never follows
 * (social networks, search engines, URL shorteners, CDNs, large stores, etc.).
 *
 * A host matches when it equals a listed domain or is a subdomain of one. Matching is done
 * on whole labels, so "notfacebook.com" does not match "facebook.com". The host is
 * lower-cased, trimmed and stripped of a trailing dot first, so the fully-qualified form
 * "facebook.com." is recognised as well.
 *
 * @param string $host Host name without scheme, port or path.
 * @return bool True when links to this host must be skipped.
 */
function isDomainSkipped($host) {
    // Built once per request and reused: domain => index, for O(1) lookups.
    static $hashSet = null;
    if ($hashSet === null) {
        $domains = [
            'facebook.com','twitter.com','x.com','instagram.com','linkedin.com',
            'pinterest.com','tiktok.com','reddit.com','tumblr.com','snapchat.com',
            'threads.net','mastodon.social','bsky.app',
            'whatsapp.com','wa.me','telegram.org','t.me','signal.org',
            'discord.com','discord.gg','slack.com',
            'google.com','bing.com','yahoo.com','yandex.ru','yandex.com',
            'youtube.com','duckduckgo.com','baidu.com',
            't.co','bit.ly','goo.gl','tinyurl.com','ow.ly','is.gd','buff.ly',
            'amazon.com','ebay.com','apple.com','aliexpress.com','shopify.com',
            'wordpress.org','developer.wordpress.org','developer.mozilla.org',
            'w3.org','schema.org','github.com','gitlab.com','bitbucket.org',
            'stackoverflow.com','stackexchange.com',
            'wikipedia.org','wikimedia.org','wikidata.org',
            'cloudflare.com','jsdelivr.net','unpkg.com',
            'fonts.googleapis.com','fonts.gstatic.com',
            'googletagmanager.com','googlesyndication.com','doubleclick.net',
            'google-analytics.com','fbcdn.net','akamaihd.net',
            'accounts.google.com','login.microsoftonline.com',
            'play.google.com','apps.apple.com','chrome.google.com',
            'web.archive.org','archive.org',
            'docs.google.com','drive.google.com','dropbox.com',
        ];
        $hashSet = array_flip($domains);
    }

    // Normalise: case-insensitive, no surrounding whitespace, no root-label dot.
    $host = rtrim(strtolower(trim((string)$host)), '.');
    if ($host === '') return false;
    if (isset($hashSet[$host])) return true;

    // Walk up the parent domains (a.b.example.com -> b.example.com -> example.com),
    // stopping before the bare top-level label.
    $parts = explode('.', $host);
    $lastIndex = count($parts) - 1;
    for ($i = 1; $i < $lastIndex; $i++) {
        $parent = implode('.', array_slice($parts, $i));
        if (isset($hashSet[$parent])) return true;
    }
    return false;
}

/**
 * Queries one randomly chosen public SearXNG instance through its JSON API and returns
 * the result URLs.
 *
 * @param string $query Search query.
 * @param int    $limit Collection stops once this many links were gathered.
 * @return array List of URLs that pass FILTER_VALIDATE_URL; empty when the request
 *               fails or the response has no "results" array.
 */
function getSearXNGLinks($query, $limit = 20) {
    $pool = [
        'https://search.ononoki.org', 'https://searx.tiekoetter.com',
        'https://search.sapti.me', 'https://searx.be', 'https://search.neet.works'
    ];
    $found = [];
    $chosen = $pool[array_rand($pool)];
    $endpoint = $chosen . '/search?q=' . urlencode($query) . '&format=json&categories=general&language=auto';
    $userAgents = getRotatingUserAgents();

    $curl = curl_init();
    curl_setopt_array($curl, [
        CURLOPT_URL => $endpoint,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 15,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_USERAGENT => $userAgents[array_rand($userAgents)],
        CURLOPT_SSL_VERIFYPEER => false,
    ]);
    $body = curl_exec($curl);
    curl_close($curl);

    if (!$body) {
        return $found;
    }
    $payload = json_decode($body, true);
    if (!isset($payload['results']) || !is_array($payload['results'])) {
        return $found;
    }

    foreach ($payload['results'] as $row) {
        if (empty($row['url']) || !filter_var($row['url'], FILTER_VALIDATE_URL)) {
            continue;
        }
        $found[] = $row['url'];
        if (count($found) >= $limit) {
            break;
        }
    }
    return $found;
}

/**
 * Scrapes a Google results page for outbound links.
 *
 * First looks for "/url?q=" redirect links; when none survive filtering, falls back to
 * every absolute anchor that does not point at a Google-owned host pattern.
 *
 * @param string $query Search query.
 * @param int    $limit Collection stops once this many links were gathered (the "num"
 *                      request parameter is capped at 100).
 * @return array List of result URLs, possibly empty.
 */
function getGoogleSearchLinks($query, $limit = 20) {
    $results = [];
    $userAgents = getRotatingUserAgents();
    $searchUrl = 'https://www.google.com/search?q=' . urlencode($query) . '&num=' . min($limit, 100);

    $curl = curl_init();
    curl_setopt_array($curl, [
        CURLOPT_URL => $searchUrl,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 15,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_USERAGENT => $userAgents[array_rand($userAgents)],
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_HTTPHEADER => ['Accept-Language: en-US,en;q=0.9'],
    ]);
    $page = curl_exec($curl);
    curl_close($curl);

    if (!$page) {
        return $results;
    }

    // Primary strategy: redirect-style result links.
    preg_match_all('/href="\/url\?q=([^&"]+)/i', $page, $redirects);
    foreach ($redirects[1] as $encoded) {
        $target = urldecode($encoded);
        if (!filter_var($target, FILTER_VALIDATE_URL) || preg_match('/google\./i', $target)) {
            continue;
        }
        $results[] = $target;
        if (count($results) >= $limit) {
            break;
        }
    }

    // Fallback strategy: any absolute anchor outside Google's own properties.
    if (empty($results)) {
        preg_match_all('/<a[^>]+href="(https?:\/\/[^"]+)"/i', $page, $anchors);
        foreach ($anchors[1] as $href) {
            if (preg_match('/google\.|gstatic\.|googleapis\.|youtube\./i', $href)) {
                continue;
            }
            $results[] = $href;
            if (count($results) >= $limit) {
                break;
            }
        }
    }
    return $results;
}

/**
 * Scrapes a Bing results page and returns the absolute links that do not point back at
 * Bing/Microsoft/MSN/Live hosts.
 *
 * @param string $query Search query.
 * @param int    $limit Collection stops once this many links were gathered (the "count"
 *                      request parameter is capped at 50).
 * @return array List of result URLs, possibly empty.
 */
function getBingSearchLinks($query, $limit = 15) {
    $collected = [];
    $userAgents = getRotatingUserAgents();
    $searchUrl = 'https://www.bing.com/search?q=' . urlencode($query) . '&count=' . min($limit, 50);

    $curl = curl_init();
    curl_setopt_array($curl, [
        CURLOPT_URL => $searchUrl,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 15,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_USERAGENT => $userAgents[array_rand($userAgents)],
        CURLOPT_SSL_VERIFYPEER => false,
    ]);
    $page = curl_exec($curl);
    curl_close($curl);

    if (!$page) {
        return $collected;
    }

    preg_match_all('/<a[^>]+href="(https?:\/\/[^"]+)"/i', $page, $anchors);
    foreach ($anchors[1] as $candidate) {
        if (preg_match('/bing\.|microsoft\.|msn\.|live\./i', $candidate) || !filter_var($candidate, FILTER_VALIDATE_URL)) {
            continue;
        }
        $collected[] = $candidate;
        if (count($collected) >= $limit) {
            break;
        }
    }
    return $collected;
}

/**
 * Same scrape as the plain Bing search, but with a randomly chosen "setlang" locale so
 * that results come from different regional indexes.
 *
 * @param string $query Search query.
 * @param int    $limit Collection stops once this many links were gathered (the "count"
 *                      request parameter is capped at 50).
 * @return array List of result URLs outside Bing/Microsoft/MSN hosts, possibly empty.
 */
function getBingRegionalLinks($query, $limit = 15) {
    $locales = ['pt-BR','es-ES','fr-FR','de-DE','it-IT','ja-JP','en-GB'];
    $locale = $locales[array_rand($locales)];
    $collected = [];
    $userAgents = getRotatingUserAgents();
    $searchUrl = 'https://www.bing.com/search?q=' . urlencode($query) . '&setlang=' . $locale . '&count=' . min($limit, 50);

    $curl = curl_init();
    curl_setopt_array($curl, [
        CURLOPT_URL => $searchUrl,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 15,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_USERAGENT => $userAgents[array_rand($userAgents)],
        CURLOPT_SSL_VERIFYPEER => false,
    ]);
    $page = curl_exec($curl);
    curl_close($curl);

    if (!$page) {
        return $collected;
    }

    preg_match_all('/<a[^>]+href="(https?:\/\/[^"]+)"/i', $page, $anchors);
    foreach ($anchors[1] as $candidate) {
        if (preg_match('/bing\.|microsoft\.|msn\./i', $candidate) || !filter_var($candidate, FILTER_VALIDATE_URL)) {
            continue;
        }
        $collected[] = $candidate;
        if (count($collected) >= $limit) {
            break;
        }
    }
    return $collected;
}

/**
 * Scrapes the HTML-only DuckDuckGo endpoint for result links.
 *
 * Result anchors carry the class "result__a"; when their href is a redirect that embeds
 * the real target in a "uddg" parameter, that target is decoded and used instead.
 *
 * @param string $query Search query.
 * @param int    $limit Collection stops once this many links were gathered.
 * @return array List of valid result URLs outside duckduckgo hosts, possibly empty.
 */
function getDuckDuckGoLinks($query, $limit = 15) {
    $collected = [];
    $userAgents = getRotatingUserAgents();
    $searchUrl = 'https://html.duckduckgo.com/html/?q=' . urlencode($query);

    $curl = curl_init();
    curl_setopt_array($curl, [
        CURLOPT_URL => $searchUrl,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 15,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_USERAGENT => $userAgents[array_rand($userAgents)],
        CURLOPT_SSL_VERIFYPEER => false,
    ]);
    $page = curl_exec($curl);
    curl_close($curl);

    if (!$page) {
        return $collected;
    }

    // The class attribute may come before or after href; try both layouts.
    preg_match_all('/class="result__a"[^>]*href="([^"]+)"/i', $page, $hits);
    if (empty($hits[1])) {
        preg_match_all('/href="(https?:\/\/[^"]+)"[^>]*class="result__a"/i', $page, $hits);
    }

    foreach (($hits[1] ?? []) as $href) {
        if (preg_match('/uddg=([^&]+)/i', $href, $wrapped)) {
            $href = urldecode($wrapped[1]);
        }
        if (!filter_var($href, FILTER_VALIDATE_URL) || preg_match('/duckduckgo\./i', $href)) {
            continue;
        }
        $collected[] = $href;
        if (count($collected) >= $limit) {
            break;
        }
    }
    return $collected;
}

/**
 * Scrapes a Yahoo results page for outbound links.
 *
 * First decodes the targets embedded in "RU=.../RK" redirect links; when none survive
 * filtering, falls back to every absolute anchor outside yahoo/yimg hosts.
 *
 * @param string $query Search query.
 * @param int    $limit Collection stops once this many links were gathered (the "n"
 *                      request parameter is capped at 50).
 * @return array List of result URLs, possibly empty.
 */
function getYahooSearchLinks($query, $limit = 10) {
    $results = [];
    $userAgents = getRotatingUserAgents();
    $searchUrl = 'https://search.yahoo.com/search?p=' . urlencode($query) . '&n=' . min($limit, 50);

    $curl = curl_init();
    curl_setopt_array($curl, [
        CURLOPT_URL => $searchUrl,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 15,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_USERAGENT => $userAgents[array_rand($userAgents)],
        CURLOPT_SSL_VERIFYPEER => false,
    ]);
    $page = curl_exec($curl);
    curl_close($curl);

    if (!$page) {
        return $results;
    }

    // Primary strategy: targets wrapped in Yahoo redirect links.
    preg_match_all('/RU=([^\/]+)\/RK/i', $page, $redirects);
    foreach ($redirects[1] as $encoded) {
        $target = urldecode($encoded);
        if (!filter_var($target, FILTER_VALIDATE_URL) || preg_match('/yahoo\./i', $target)) {
            continue;
        }
        $results[] = $target;
        if (count($results) >= $limit) {
            break;
        }
    }

    // Fallback strategy: any absolute anchor outside Yahoo's own hosts.
    if (empty($results)) {
        preg_match_all('/<a[^>]+href="(https?:\/\/[^"]+)"/i', $page, $anchors);
        foreach ($anchors[1] as $href) {
            if (preg_match('/yahoo\.|yimg\./i', $href)) {
                continue;
            }
            $results[] = $href;
            if (count($results) >= $limit) {
                break;
            }
        }
    }
    return $results;
}

/**
 * Scrapes a Yandex results page for outbound links.
 *
 * Prefers anchors whose class contains "organic"; when there are none, every absolute
 * anchor on the page is considered instead.
 *
 * @param string $query Search query.
 * @param int    $limit Collection stops once this many links were gathered.
 * @return array List of valid result URLs outside yandex/yastatic hosts, possibly empty.
 */
function getYandexSearchLinks($query, $limit = 10) {
    $collected = [];
    $userAgents = getRotatingUserAgents();
    $searchUrl = 'https://yandex.com/search/?text=' . urlencode($query);

    $curl = curl_init();
    curl_setopt_array($curl, [
        CURLOPT_URL => $searchUrl,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 15,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_USERAGENT => $userAgents[array_rand($userAgents)],
        CURLOPT_SSL_VERIFYPEER => false,
    ]);
    $page = curl_exec($curl);
    curl_close($curl);

    if (!$page) {
        return $collected;
    }

    preg_match_all('/<a[^>]+href="(https?:\/\/[^"]+)"[^>]*class="[^"]*organic[^"]*"/i', $page, $hits);
    if (empty($hits[1])) {
        preg_match_all('/<a[^>]+href="(https?:\/\/[^"]+)"/i', $page, $hits);
    }

    foreach ($hits[1] as $candidate) {
        if (preg_match('/yandex\.|yastatic\./i', $candidate) || !filter_var($candidate, FILTER_VALIDATE_URL)) {
            continue;
        }
        $collected[] = $candidate;
        if (count($collected) >= $limit) {
            break;
        }
    }
    return $collected;
}

/**
 * Scrapes a Brave Search results page for outbound links.
 *
 * Prefers anchors whose class contains "result-header"; when there are none, every
 * absolute anchor on the page is considered instead.
 *
 * @param string $query Search query.
 * @param int    $limit Collection stops once this many links were gathered.
 * @return array List of valid result URLs outside brave hosts, possibly empty.
 */
function getBraveSearchLinks($query, $limit = 15) {
    $collected = [];
    $userAgents = getRotatingUserAgents();
    $searchUrl = 'https://search.brave.com/search?q=' . urlencode($query);

    $curl = curl_init();
    curl_setopt_array($curl, [
        CURLOPT_URL => $searchUrl,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 15,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_USERAGENT => $userAgents[array_rand($userAgents)],
        CURLOPT_SSL_VERIFYPEER => false,
    ]);
    $page = curl_exec($curl);
    curl_close($curl);

    if (!$page) {
        return $collected;
    }

    preg_match_all('/<a[^>]+href="(https?:\/\/[^"]+)"[^>]*class="[^"]*result-header[^"]*"/i', $page, $hits);
    if (empty($hits[1])) {
        preg_match_all('/<a[^>]+href="(https?:\/\/[^"]+)"/i', $page, $hits);
    }

    foreach ($hits[1] as $candidate) {
        if (preg_match('/brave\./i', $candidate) || !filter_var($candidate, FILTER_VALIDATE_URL)) {
            continue;
        }
        $collected[] = $candidate;
        if (count($collected) >= $limit) {
            break;
        }
    }
    return $collected;
}

/**
 * Looks up article URLs on English Wikipedia through the MediaWiki "opensearch" API.
 *
 * The opensearch response is a four-element JSON array; index 3 holds the article URLs.
 *
 * A User-Agent header is now sent, like every other fetcher in this file does. PHP's cURL
 * binding sends none by default, and the Wikimedia User-Agent policy states that requests
 * without one may be rejected.
 * NOTE(review): that policy asks for a descriptive, contactable agent string rather than a
 * browser one - consider a dedicated value for this endpoint.
 *
 * @param string $query Search query.
 * @param int    $limit Maximum number of links requested and returned.
 * @return array List of article URLs; empty when the request fails or the response has
 *               an unexpected shape.
 */
function getWikipediaLinks($query, $limit = 10) {
    $links = [];
    $limit = (int)$limit; // keep the value interpolated into the URL strictly numeric
    $agents = getRotatingUserAgents();
    $url = 'https://en.wikipedia.org/w/api.php?action=opensearch&search=' . urlencode($query) . '&limit=' . $limit . '&format=json';

    $ch = curl_init();
    curl_setopt_array($ch, [
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 10,
        CURLOPT_USERAGENT => $agents[array_rand($agents)],
        CURLOPT_SSL_VERIFYPEER => false,
    ]);
    $response = curl_exec($ch);
    curl_close($ch);

    if (!$response) {
        return $links;
    }
    $data = json_decode($response, true);
    if (!isset($data[3]) || !is_array($data[3])) {
        return $links;
    }

    foreach ($data[3] as $u) {
        $links[] = $u;
        if (count($links) >= $limit) break;
    }
    return $links;
}

/**
 * Scrapes a Baidu results page and returns the absolute links that do not point back at
 * baidu/bdstatic/baidustatic hosts.
 *
 * @param string $query Search query.
 * @param int    $limit Collection stops once this many links were gathered.
 * @return array List of valid result URLs, possibly empty.
 */
function getBaiduSearchLinks($query, $limit = 10) {
    $collected = [];
    $userAgents = getRotatingUserAgents();
    $searchUrl = 'https://www.baidu.com/s?wd=' . urlencode($query);

    $curl = curl_init();
    curl_setopt_array($curl, [
        CURLOPT_URL => $searchUrl,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 15,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_USERAGENT => $userAgents[array_rand($userAgents)],
        CURLOPT_SSL_VERIFYPEER => false,
    ]);
    $page = curl_exec($curl);
    curl_close($curl);

    if (!$page) {
        return $collected;
    }

    preg_match_all('/<a[^>]+href="(https?:\/\/[^"]+)"/i', $page, $anchors);
    foreach ($anchors[1] as $candidate) {
        if (preg_match('/baidu\.|bdstatic\.|baidustatic\./i', $candidate) || !filter_var($candidate, FILTER_VALIDATE_URL)) {
            continue;
        }
        $collected[] = $candidate;
        if (count($collected) >= $limit) {
            break;
        }
    }
    return $collected;
}

/**
 * Runs "site:"-restricted Google searches against a fixed list of publisher domains.
 *
 * Only the first three domains of the list are queried, five links each, and querying
 * stops early once enough links were collected.
 *
 * @param string $query Search query appended after the "site:" restriction.
 * @param int    $limit Maximum number of links returned.
 * @return array At most $limit links, in the order the sites were queried.
 */
function getDirectSearchLinks($query, $limit = 10) {
    $publishers = ['medium.com','dev.to','hackernews.com','techcrunch.com','wired.com','theverge.com','arstechnica.com','bbc.com','reuters.com','bloomberg.com'];
    $queried = array_slice($publishers, 0, 3);

    $collected = [];
    foreach ($queried as $domain) {
        $collected = array_merge($collected, getGoogleSearchLinks("site:$domain $query", 5));
        if (count($collected) >= $limit) {
            break;
        }
    }
    return array_slice($collected, 0, $limit);
}

// ============================================
// BLOCO 7: CONTENT EXTRACTION FUNCTIONS
// ============================================

/**
 * Downloads a single page with cURL.
 *
 * Follows up to 5 redirects, accepts gzip/deflate, gives up after 20 seconds and sends a
 * randomly picked browser User-Agent. TLS peer verification is turned off.
 *
 * @param string $url    Address to fetch.
 * @param array  $config Optional 'http_user' and 'http_pass'; when both are non-empty
 *                       they are sent as HTTP credentials.
 * @return array ['html' => response body or '' on failure, 'http_code' => int status].
 */
function fetchWithCurl($url, $config = []) {
    $userAgents = getRotatingUserAgents();
    $handle = curl_init();

    curl_setopt_array($handle, [
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 20,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 5,
        CURLOPT_USERAGENT => $userAgents[array_rand($userAgents)],
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_ENCODING => 'gzip, deflate',
        CURLOPT_HTTPHEADER => ['Accept: text/html,application/xhtml+xml', 'Accept-Language: en-US,en;q=0.9'],
    ]);

    $hasCredentials = !empty($config['http_user']) && !empty($config['http_pass']);
    if ($hasCredentials) {
        curl_setopt($handle, CURLOPT_USERPWD, $config['http_user'] . ':' . $config['http_pass']);
    }

    $body = curl_exec($handle);
    $status = (int)curl_getinfo($handle, CURLINFO_HTTP_CODE);
    curl_close($handle);

    return [
        'html' => $body ? $body : '',
        'http_code' => $status,
    ];
}

/**
 * Downloads several pages in parallel with curl_multi.
 *
 * Each transfer follows up to 5 redirects, accepts gzip/deflate, gives up after 20
 * seconds and sends a randomly picked browser User-Agent. TLS peer verification is off.
 *
 * Changes compared with the previous version:
 *  - duplicate URLs are collapsed before handles are created; previously a repeated URL
 *    overwrote its entry in the handle map and the earlier handle was never removed from
 *    the multi handle nor closed;
 *  - a failing curl_multi_select() (return value -1) is followed by a short sleep
 *    instead of spinning the CPU;
 *  - the loop stops when curl_multi_exec() reports an error instead of looping forever.
 *
 * NOTE(review): $concurrency is accepted but not enforced - every URL of the batch is
 * started at once. Left as is so existing callers keep their current timing.
 *
 * @param array $urls        URLs to fetch.
 * @param int   $concurrency Currently unused (see note above).
 * @param int   $maxUrls     Only the first $maxUrls entries of $urls are fetched.
 * @param array $config      Optional 'http_user' and 'http_pass' sent as HTTP credentials.
 * @return array Map url => ['html' => string, 'http_code' => int, 'time' => float seconds].
 */
function fetchBatchWithCurl($urls, $concurrency = 5, $maxUrls = 50, $config = []) {
    $urls = array_slice($urls, 0, $maxUrls);
    if (empty($urls)) return [];

    // Results are keyed by URL, so fetching the same URL twice is pure waste.
    $urls = array_unique($urls);

    $results = [];
    $agents = getRotatingUserAgents();
    $mh = curl_multi_init();
    $handles = [];

    foreach ($urls as $url) {
        $ch = curl_init();
        $opts = [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => 20,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            CURLOPT_USERAGENT => $agents[array_rand($agents)],
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_ENCODING => 'gzip, deflate',
            CURLOPT_HTTPHEADER => ['Accept: text/html,application/xhtml+xml']
        ];
        if (!empty($config['http_user']) && !empty($config['http_pass'])) {
            $opts[CURLOPT_USERPWD] = $config['http_user'] . ':' . $config['http_pass'];
        }
        curl_setopt_array($ch, $opts);
        curl_multi_add_handle($mh, $ch);
        $handles[$url] = $ch;
    }

    // Drive all transfers until none is running any more.
    $running = null;
    do {
        $execStatus = curl_multi_exec($mh, $running);
        if ($running > 0 && curl_multi_select($mh, 1) === -1) {
            usleep(10000); // select failed: back off briefly rather than busy-loop
        }
        $canContinue = ($execStatus === CURLM_OK || $execStatus === CURLM_CALL_MULTI_PERFORM);
    } while ($running > 0 && $canContinue);

    foreach ($handles as $url => $ch) {
        $results[$url] = [
            'html' => curl_multi_getcontent($ch) ?: '',
            'http_code' => (int)curl_getinfo($ch, CURLINFO_HTTP_CODE),
            'time' => round(curl_getinfo($ch, CURLINFO_TOTAL_TIME), 3)
        ];
        curl_multi_remove_handle($mh, $ch);
        curl_close($ch);
    }
    curl_multi_close($mh);
    return $results;
}

/**
 * Pulls the display metadata of a page out of its raw HTML using regular expressions.
 *
 * Sources, in order of preference:
 *  - title:       <title>, then og:title
 *  - description: meta description, then og:description, then the first 300 characters
 *                 of the visible text
 *  - thumbnail:   og:image, twitter:image, og:image:url, link rel="image_src", an <img>
 *                 with a "featured" class, then the first <img> inside <article>;
 *                 candidates that look like logos, icons or placeholders are rejected
 *  - favicon:     the first <link> whose rel contains "icon", else /favicon.ico on the host
 *  - author:      meta author, then the text of an element whose class contains "author"
 *  - tags:        meta keywords, else tags derived from the title
 *
 * @param string $url  Address of the page; used to resolve relative image/icon URLs.
 * @param string $html Raw page markup.
 * @return array Keys: title, description, thumbnail, favicon, author, tags (all strings).
 */
function extractMetadata($url, $html) {
    $meta = ['title'=>'','description'=>'','thumbnail'=>'','favicon'=>'','author'=>'','tags'=>''];

    // First capture group of $pattern in the page, or null when it does not match.
    $capture = function ($pattern) use ($html) {
        return preg_match($pattern, $html, $hit) ? $hit[1] : null;
    };
    // Decodes HTML entities and trims surrounding whitespace.
    $clean = function ($raw) {
        return trim(html_entity_decode($raw, ENT_QUOTES, 'UTF-8'));
    };
    $isAbsolute = function ($candidate) {
        return (bool)preg_match('/^https?:\/\//i', $candidate);
    };

    // Title
    $titlePatterns = ['/<title[^>]*>([^<]+)<\/title>/i', '/property="og:title"\s+content="([^"]+)"/i'];
    foreach ($titlePatterns as $pattern) {
        if (!empty($meta['title'])) break;
        $raw = $capture($pattern);
        if ($raw !== null) $meta['title'] = $clean($raw);
    }

    // Description
    $descPatterns = ['/name="description"\s+content="([^"]+)"/i', '/property="og:description"\s+content="([^"]+)"/i'];
    foreach ($descPatterns as $pattern) {
        if (!empty($meta['description'])) break;
        $raw = $capture($pattern);
        if ($raw !== null) $meta['description'] = $clean($raw);
    }
    if (empty($meta['description'])) {
        // Last resort: leading visible text with scripts and styles removed.
        $withoutCode = preg_replace('/<(script|style|noscript)[^>]*>.*?<\/\1>/si', '', $html);
        $visible = trim(preg_replace('/\s+/', ' ', strip_tags($withoutCode)));
        $meta['description'] = mb_substr($visible, 0, 300);
    }

    // Thumbnail (multiple sources, first acceptable candidate wins)
    $thumbPatterns = [
        '/property="og:image"\s+content="([^"]+)"/i',
        '/name="twitter:image"\s+content="([^"]+)"/i',
        '/property="og:image:url"\s+content="([^"]+)"/i',
        '/<link[^>]+rel="image_src"[^>]+href="([^"]+)"/i',
        '/<img[^>]+class="[^"]*featured[^"]*"[^>]+src="([^"]+)"/i',
        '/<img[^>]+src="([^"]+)"[^>]+class="[^"]*featured/i',
        '/<article[^>]*>.*?<img[^>]+src="([^"]+)"/si',
    ];
    foreach ($thumbPatterns as $pattern) {
        $candidate = $capture($pattern);
        if ($candidate === null) continue;
        if (!$isAbsolute($candidate)) $candidate = normalizeUrl($candidate, $url);
        $looksDecorative = preg_match('/logo|icon|avatar|sprite|placeholder|1x1|blank\./i', $candidate);
        if ($candidate && strlen($candidate) > 10 && !$looksDecorative) {
            $meta['thumbnail'] = $candidate;
            break;
        }
    }

    // Favicon
    $icon = $capture('/<link[^>]+rel="[^"]*icon[^"]*"[^>]+href="([^"]+)"/i');
    if ($icon !== null) {
        if (!$isAbsolute($icon)) $icon = normalizeUrl($icon, $url);
        $meta['favicon'] = $icon ?: '';
    }
    if (empty($meta['favicon'])) {
        $meta['favicon'] = 'https://' . parse_url($url, PHP_URL_HOST) . '/favicon.ico';
    }

    // Author
    $author = $capture('/name="author"\s+content="([^"]+)"/i');
    if ($author !== null) $meta['author'] = trim($author);
    if (empty($meta['author'])) {
        $byline = $capture('/class="[^"]*author[^"]*"[^>]*>([^<]+)</i');
        if ($byline !== null) $meta['author'] = trim(strip_tags($byline));
    }

    // Tags
    $keywords = $capture('/name="keywords"\s+content="([^"]+)"/i');
    if ($keywords !== null) $meta['tags'] = trim($keywords);
    if (empty($meta['tags']) && !empty($meta['title'])) {
        $meta['tags'] = extractTagsFromTitle($meta['title']);
    }

    return $meta;
}

/**
 * Turns a link found in a page into an absolute URL.
 *
 * Rules, in order:
 *  - empty input and javascript:, mailto:, tel:, data: or fragment-only links -> ''
 *  - absolute http(s) URLs are returned unchanged
 *  - protocol-relative links ("//host/path") are prefixed with "https:"
 *  - root-relative links ("/path") are attached to the base scheme, host and port
 *  - anything else is treated as relative to the directory of the base path
 *
 * Fixes compared with the previous version:
 *  - a relative link against a base without a path ("https://example.com" + "page.html")
 *    used to produce "https:/page.html"; it now yields "https://example.com/page.html"
 *  - the port of the base URL is kept for root-relative links
 *  - a base URL without a host now yields '' instead of a malformed URL
 *
 * "." and ".." path segments are not collapsed.
 *
 * @param string $url     Link as found in the markup.
 * @param string $baseUrl Absolute URL of the page containing the link.
 * @return string Absolute URL, or '' when the link cannot or should not be followed.
 */
function normalizeUrl($url, $baseUrl) {
    if (empty($url)) return '';
    $url = trim($url);
    if ($url === '') return '';
    if (preg_match('/^(javascript:|mailto:|tel:|#|data:)/i', $url)) return '';
    if (preg_match('/^https?:\/\//i', $url)) return $url;
    if (strpos($url, '//') === 0) return 'https:' . $url;

    $base = parse_url((string)$baseUrl);
    if (!is_array($base) || empty($base['host'])) return '';
    $origin = ($base['scheme'] ?? 'https') . '://' . $base['host'];
    if (isset($base['port'])) $origin .= ':' . $base['port'];

    if ($url[0] === '/') return $origin . $url;

    // Directory of the base document: everything up to and including the last slash of
    // its path (query string and fragment are not part of the path).
    $basePath = $base['path'] ?? '/';
    $lastSlash = strrpos($basePath, '/');
    $directory = ($lastSlash === false) ? '/' : substr($basePath, 0, $lastSlash + 1);

    return $origin . $directory . $url;
}

/**
 * Heuristic quality score of a page, from 0 to 100.
 *
 * Starts at 50 and applies independent bonuses and penalties based on title length,
 * amount of visible text, presence of a description and thumbnail, URL shape
 * (article-like vs. listing/login-like), semantic markup and the robots noindex flag.
 *
 * @param string $url  Address of the page.
 * @param string $html Raw page markup.
 * @param array  $meta Metadata of the page; reads 'title', 'description', 'thumbnail'.
 * @return int Score clamped to the range 0..100.
 */
function scoreContentQuality($url, $html, $meta) {
    $titleLength = strlen($meta['title'] ?? '');
    $visibleText = strip_tags(preg_replace('/<(script|style|noscript)[^>]*>.*?<\/\1>/si', '', $html));
    $textLength = strlen($visibleText);
    $robots = hasNoindexNofollow($html);

    // Each rule is [condition, points]; every rule that holds contributes its points.
    $rules = [
        [$titleLength > 20, 10],
        [$titleLength < 5, -20],
        [$textLength > 2000, 15],
        [$textLength > 500, 5],
        [$textLength < 100, -30],
        [!empty($meta['description']) && strlen($meta['description']) > 50, 5],
        [!empty($meta['thumbnail']), 5],
        [preg_match('/\/post[s]?\/|\/article|\/blog\/|\/news\/|\d{4}\/\d{2}\//i', $url), 10],
        [preg_match('/\/tag\/|\/category\/|\/page\/\d|\/search\?|\/login|\/register/i', $url), -15],
        [preg_match('/<article|<main|role="main"/i', $html), 5],
        [$robots['noindex'], -30],
    ];

    $total = 50;
    foreach ($rules as list($applies, $points)) {
        if ($applies) {
            $total += $points;
        }
    }
    return min(100, max(0, $total));
}

/**
 * Scores how well a page matches the configured keywords.
 *
 * For every keyword, each of its space-separated words of at least 3 bytes adds 4 points
 * when found in the title, 3 in the description and 2 in the URL; the whole keyword
 * found in the title adds 5 more. Keywords that are URLs are ignored.
 *
 * Case folding is now Unicode-aware when the mbstring extension is available, so
 * accented keywords match regardless of letter case. The previous strtolower()/stripos()
 * combination only folds ASCII letters.
 *
 * @param string $url      Address of the page.
 * @param array  $meta     Metadata of the page; reads 'title' and 'description'.
 * @param array  $keywords Search keywords or phrases.
 * @return int 10 when no keywords are given, otherwise the score capped at 100.
 */
function calculateRelevanceScore($url, $meta, $keywords) {
    if (empty($keywords)) return 10;

    // Lower-cases UTF-8 text; falls back to the byte-wise variant without mbstring.
    $lower = function ($text) {
        $text = (string)$text;
        return function_exists('mb_strtolower') ? mb_strtolower($text, 'UTF-8') : strtolower($text);
    };

    $title = $lower($meta['title'] ?? '');
    $desc = $lower($meta['description'] ?? '');
    $urlLower = $lower($url);

    $score = 0;
    foreach ($keywords as $kw) {
        $kw = $lower(trim($kw));
        if (empty($kw) || preg_match('/^https?:\/\//i', $kw)) continue;

        foreach (explode(' ', $kw) as $word) {
            // Byte length on purpose: keeps the historical cut-off for short words.
            if (strlen($word) < 3) continue;
            // Both sides are already lower-cased, so a plain substring search suffices.
            if (strpos($title, $word) !== false) $score += 4;
            if (strpos($desc, $word) !== false) $score += 3;
            if (strpos($urlLower, $word) !== false) $score += 2;
        }
        if (strpos($title, $kw) !== false) $score += 5;
    }
    return min(100, $score);
}

/**
 * Reads the robots directives declared in the page's <meta name="robots"> tags.
 *
 * The previous pattern only recognised the tag when it used double quotes and listed
 * name before content, so content-first, single-quoted or unquoted tags were silently
 * ignored. Every <meta> tag is now inspected on its own, attributes may come in any
 * order and quoting style, and the directives of all robots tags are combined.
 *
 * @param string $html Raw page markup.
 * @return array ['noindex' => bool, 'nofollow' => bool]
 */
function hasNoindexNofollow($html) {
    $result = ['noindex' => false, 'nofollow' => false];

    if (!preg_match_all('/<meta\b[^>]*>/i', (string)$html, $tags)) {
        return $result;
    }

    foreach ($tags[0] as $tag) {
        // Only tags whose name attribute is exactly "robots" (any quoting) are relevant.
        if (!preg_match('/\bname\s*=\s*(?:"robots"|\'robots\'|robots(?![\w-]))/i', $tag)) {
            continue;
        }
        if (!preg_match('/\bcontent\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/i', $tag, $attr)) {
            continue;
        }
        // Exactly one of the three alternatives captured the value.
        $content = strtolower(($attr[1] ?? '') . ($attr[2] ?? '') . ($attr[3] ?? ''));
        if (strpos($content, 'noindex') !== false) $result['noindex'] = true;
        if (strpos($content, 'nofollow') !== false) $result['nofollow'] = true;
    }
    return $result;
}

/**
 * Returns the canonical URL declared by the page, if any.
 *
 * Looks for a <link> tag with rel="canonical" followed by an href attribute. A relative
 * href is resolved against the URL of the page itself.
 *
 * @param string $html       Raw page markup.
 * @param string $currentUrl Address the markup was fetched from.
 * @return string|null Canonical URL, or null when the page declares none.
 */
function getCanonicalUrl($html, $currentUrl) {
    $declared = preg_match('/<link[^>]+rel="canonical"[^>]+href="([^"]+)"/i', $html, $hit);
    if (!$declared) {
        return null;
    }

    $href = trim($hit[1]);
    $isAbsolute = preg_match('/^https?:\/\//i', $href);

    return $isAbsolute ? $href : normalizeUrl($href, $currentUrl);
}

/**
 * Collects the crawlable links of a page.
 *
 * Every double-quoted href of an <a> tag is made absolute and then dropped when it:
 *  - is empty or shorter than 15 characters,
 *  - matches the login/admin/cart/feed/etc. skip patterns,
 *  - points at a host on the skip list,
 *  - is external while $config['follow_external'] is false,
 *  - has a media or archive file extension,
 *  - carries more than 4 query parameters (more than 3 "&"),
 *  - contains one of the exclude terms.
 * With $config['posts_first'] set, links that look like articles are returned first.
 *
 * Fixes compared with the previous version:
 *  - duplicates are removed before the list is cut to $maxLinks; slicing first returned
 *    fewer links than requested and left gaps in the array keys;
 *  - the host of the base URL is lower-cased like the link hosts, so
 *    follow_external=false no longer rejects internal links of mixed-case base URLs.
 *
 * @param string $html         Raw page markup.
 * @param string $baseUrl      Address of the page; relative links are resolved against it.
 * @param int    $maxLinks     Maximum number of links returned.
 * @param array  $includeTerms Accepted for signature compatibility; not applied here.
 * @param array  $excludeTerms Links containing any of these terms (case-insensitive) are dropped.
 * @param array  $config       Optional 'follow_external' (default true) and 'posts_first'.
 * @return array Zero-indexed list of unique absolute URLs.
 */
function extractAllLinks($html, $baseUrl, $maxLinks = 50, $includeTerms = [], $excludeTerms = [], $config = []) {
    preg_match_all('/<a[^>]+href="([^"]+)"/i', $html, $matches);
    $skipPatterns = '/login|signin|signup|register|admin|wp-admin|dashboard|cart|checkout|account|profile|settings|password|logout|api\/|xmlrpc|mailto:|javascript:|tel:|#respond|\/feed\/?$|\/wp-json\//i';
    $mediaExts = ['jpg','jpeg','png','gif','webp','svg','bmp','ico','mp4','avi','webm','mov','mp3','wav','pdf','doc','docx','xls','zip','rar','7z','tar','gz'];
    $baseDomain = strtolower(parse_url($baseUrl, PHP_URL_HOST) ?: '');
    $followExternal = $config['follow_external'] ?? true;
    $postsFirst = !empty($config['posts_first']);
    $postPatterns = '/\/post[s]?\/|\/article[s]?\/|\/blog\/|\/news\/|\/story\/|\d{4}\/\d{2}\/|\.html?$/i';
    $postLinks = [];
    $otherLinks = [];
    $seen = []; // url => true, so each link is kept only at its first occurrence

    foreach ($matches[1] as $link) {
        $normalized = normalizeUrl($link, $baseUrl);
        if (!$normalized || strlen($normalized) < 15) continue;
        if (isset($seen[$normalized])) continue;
        if (preg_match($skipPatterns, $normalized)) continue;

        $linkHost = strtolower(parse_url($normalized, PHP_URL_HOST) ?: '');
        if (isDomainSkipped($linkHost)) continue;
        if (!$followExternal && $linkHost !== $baseDomain) continue;

        $ext = strtolower(pathinfo(parse_url($normalized, PHP_URL_PATH) ?: '', PATHINFO_EXTENSION));
        if (in_array($ext, $mediaExts, true)) continue;
        if (substr_count(parse_url($normalized, PHP_URL_QUERY) ?: '', '&') > 3) continue;

        if (!empty($excludeTerms)) {
            $skip = false;
            foreach ($excludeTerms as $term) {
                if (!empty($term) && stripos($normalized, $term) !== false) { $skip = true; break; }
            }
            if ($skip) continue;
        }

        $seen[$normalized] = true;
        if ($postsFirst && preg_match($postPatterns, $normalized)) { $postLinks[] = $normalized; }
        else { $otherLinks[] = $normalized; }
    }

    // Already unique, so the cut really yields up to $maxLinks links with keys 0..n-1.
    return array_slice(array_merge($postLinks, $otherLinks), 0, $maxLinks);
}

/**
 * Pull result URLs out of a search-engine results page.
 *
 * The engine is recognised from $url (Google, Bing, DuckDuckGo, Yahoo) and the
 * matching extraction pattern is applied to $html. When no engine-specific
 * pattern produced anything, every absolute http(s) link that does not point
 * back at a known search engine is taken instead.
 *
 * Duplicates are removed before the list is capped at 50 entries, and the
 * result is re-indexed so it is always a plain list.
 *
 * @param string $url  URL of the results page (used only to detect the engine).
 * @param string $html HTML of the results page.
 * @return array Up to 50 unique result URLs with sequential keys.
 */
function extractSearchResultsFromUrl($url, $html) {
    $links = [];
    // Google: results are wrapped as /url?q=<encoded target>&...
    if (preg_match('/google\./i', $url)) {
        preg_match_all('/href="\/url\?q=([^&"]+)/i', $html, $m);
        foreach ($m[1] as $u) {
            $u = urldecode($u);
            if (filter_var($u, FILTER_VALIDATE_URL) && !preg_match('/google\./i', $u)) $links[] = $u;
        }
    }
    // Bing: plain absolute links, minus Microsoft-owned hosts
    if (preg_match('/bing\./i', $url)) {
        preg_match_all('/<a[^>]+href="(https?:\/\/[^"]+)"/i', $html, $m);
        foreach ($m[1] as $u) {
            if (!preg_match('/bing\.|microsoft\.|msn\./i', $u)) $links[] = $u;
        }
    }
    // DuckDuckGo: target is carried in the uddg= redirect parameter
    if (preg_match('/duckduckgo\./i', $url)) {
        preg_match_all('/uddg=([^&"]+)/i', $html, $m);
        foreach ($m[1] as $u) {
            $u = urldecode($u);
            if (filter_var($u, FILTER_VALIDATE_URL)) $links[] = $u;
        }
    }
    // Yahoo: target is carried between RU= and /RK in the redirect URL
    if (preg_match('/yahoo\./i', $url)) {
        preg_match_all('/RU=([^\/]+)\/RK/i', $html, $m);
        foreach ($m[1] as $u) {
            $u = urldecode($u);
            if (filter_var($u, FILTER_VALIDATE_URL)) $links[] = $u;
        }
    }
    // Generic fallback
    if (empty($links)) {
        preg_match_all('/<a[^>]+href="(https?:\/\/[^"]+)"/i', $html, $m);
        foreach ($m[1] as $u) {
            if (!preg_match('/google\.|bing\.|yahoo\.|yandex\.|duckduckgo\.|baidu\./i', $u) && filter_var($u, FILTER_VALIDATE_URL)) $links[] = $u;
        }
    }
    // De-duplicate first, then cap: slicing first would let repeated links
    // crowd out distinct ones. array_values() closes the key gaps left by
    // array_unique().
    return array_slice(array_values(array_unique($links)), 0, 50);
}

/**
 * Build an <iframe> embed snippet for a media URL.
 *
 * Recognised hosts: YouTube (youtube.com, m.youtube.com, youtu.be), Vimeo,
 * TikTok, SoundCloud and Spotify. Any other host, or a recognised host whose
 * URL carries no usable media id, yields an empty string.
 *
 * @param string $url Page URL to turn into an embed.
 * @return string Iframe HTML, or '' when the URL is not embeddable.
 */
function extractEmbedCode($url) {
    $host = parse_url($url, PHP_URL_HOST);
    if (!$host) return '';
    // Lowercase before stripping "www." so mixed-case hosts are recognised too.
    $host = preg_replace('/^www\./', '', strtolower($host));
    // parse_url() yields null when a component is absent; normalise to strings
    // so no null reaches ltrim()/preg_match()/parse_str().
    $path = (string) parse_url($url, PHP_URL_PATH);
    $query = (string) parse_url($url, PHP_URL_QUERY);

    if (in_array($host, ['youtube.com','m.youtube.com','youtu.be'], true)) {
        $videoId = null;
        if ($host === 'youtu.be') {
            $videoId = ltrim($path, '/');
        } else {
            parse_str($query, $q);
            // "?v[]=x" makes parse_str() produce an array; only a string is a valid id.
            $videoId = (isset($q['v']) && is_string($q['v'])) ? $q['v'] : null;
            if (!$videoId && preg_match('#/(shorts|embed|v)/([a-zA-Z0-9_-]+)#', $url, $m)) $videoId = $m[2];
        }
        if ($videoId) return '<iframe src="https://www.youtube.com/embed/'.htmlspecialchars($videoId).'" width="560" height="315" frameborder="0" allowfullscreen></iframe>';
    }
    if (in_array($host, ['vimeo.com','player.vimeo.com'], true)) {
        if (preg_match('#/(\d+)#', $path, $m)) return '<iframe src="https://player.vimeo.com/video/'.$m[1].'" width="560" height="315" frameborder="0" allowfullscreen></iframe>';
    }
    if (strpos($host, 'tiktok.com') !== false) {
        if (preg_match('#/video/(\d+)#', $url, $m)) return '<iframe src="https://www.tiktok.com/embed/'.$m[1].'" width="560" height="315" frameborder="0" allowfullscreen></iframe>';
    }
    if ($host === 'soundcloud.com') return '<iframe width="100%" height="166" scrolling="no" frameborder="no" src="https://w.soundcloud.com/player/?url='.urlencode($url).'"></iframe>';
    if (strpos($host, 'spotify.com') !== false) {
        if (preg_match('#/(track|album|playlist|episode)/([a-zA-Z0-9]+)#', $url, $m)) return '<iframe src="https://open.spotify.com/embed/'.$m[1].'/'.$m[2].'" width="300" height="380" frameborder="0"></iframe>';
    }
    return '';
}

/**
 * Derive a comma-separated tag list from a title.
 *
 * Punctuation is replaced by spaces, the text is lowercased and split on
 * whitespace; tokens of two characters or fewer, common English stop words and
 * numeric tokens are discarded. Remaining tokens are de-duplicated in order of
 * first appearance and the joined string is cut to 500 characters.
 *
 * @param string $title Title text.
 * @return string Tags joined with ", ", or '' for an empty title.
 */
function extractTagsFromTitle($title) {
    if (empty($title)) {
        return '';
    }

    $stopWords = ['a','an','the','and','or','but','in','at','on','with','to','for','is','of','that','it','by','from','as','are','was','be','has','have','will','this','which','its','about','up','more','who','also','they','out','he','she','you','their','we','her','his','them','been','these','would','some','can','like','there','if','all','my','what','so','then','into','just','over','do','than','when','other','how','our','any','new','me','not','no','very','much','get','got','one','two'];

    $depunctuated = preg_replace("/[.,\/#!$%\^&\*;:{}=\-_`~()\[\]\"']/", " ", $title);
    $tokens = preg_split('/\s+/', trim(mb_strtolower($depunctuated)));

    $tags = [];
    foreach ($tokens as $token) {
        if (mb_strlen($token) <= 2) {
            continue;
        }
        if (in_array($token, $stopWords)) {
            continue;
        }
        if (is_numeric($token)) {
            continue;
        }
        $tags[] = $token;
    }

    $joined = implode(', ', array_unique($tags));
    return mb_substr($joined, 0, 500);
}

/**
 * Pick one display name at random from a fixed pool of author names and handles.
 *
 * @return string One entry of the built-in pool.
 */
function generateRandomAuthor() {
    $pool = [
        'Alex Morgan','Sarah Chen','David Kim','Maria Santos','James Wilson',
        'Emma Davis','Carlos Rivera','Yuki Tanaka','Hassan Ali','Priya Patel',
        'Lucas Martin','Sofia Rossi','Andre Laurent','Nina Kovacs','Marco Bianchi',
        'Elena Petrova','Ryan OBrien','Aisha Mbeki','Felix Wagner','Maya Johansson',
        'SilverFox','NightOwl','StormRider','CyberNinja','PixelHeart',
        'DarkPhoenix','CosmicRay','NeonDream','SolarFlare','MoonShadow',
        'John Smith','Maria Garcia','David Lee','Sarah Kim','Michael Johnson',
        'Jennifer Brown','Robert Taylor','Lisa Anderson','William Thomas','Nancy Jackson',
        'Harper James','Jasmine Cloud','Felix Storm','Ruby Ocean','Zephyr Moon',
        'Sage Autumn','River Dawn','Phoenix Ember','Aurora Night','Orion Sky',
    ];
    $chosenIndex = array_rand($pool);
    return $pool[$chosenIndex];
}

// ============================================
// BLOCO 8: API HANDLERS
// ============================================

$action = $_GET['action'] ?? '';
$workerType = $_GET['worker'] ?? '';

// Worker entry points (called via background HTTP requests).
// ?worker=discovery|processor&pid=<process id>&cfg=<base64-encoded JSON config>
if (in_array($workerType, ['discovery', 'processor'], true) && !empty($_GET['pid']) && is_string($_GET['pid'])) {
    header('Content-Type: text/plain');

    // launchBackgroundWorker() drops the connection after 500 ms; keep running
    // after the caller has gone away instead of being aborted with it.
    ignore_user_abort(true);

    $pid = $_GET['pid'];

    // Decode the config defensively: anything that is not a base64 string
    // holding a JSON object/array collapses to an empty config.
    $rawCfg = $_GET['cfg'] ?? '';
    $configJson = is_string($rawCfg) ? base64_decode($rawCfg) : '';
    $cfg = json_decode((string) $configJson, true);
    if (!is_array($cfg)) {
        $cfg = [];
    }
    $kws = $cfg['keywords'] ?? [];

    if ($workerType === 'discovery') {
        runDiscoveryWorker($pid, $kws, $cfg, $pdo);
    } else {
        runProcessorWorker($pid, $kws, $cfg, $pdo);
    }
    echo "DONE";
    exit;
}

// action=create_pipeline: validate the submitted settings, build the worker
// config and start the requested number of discovery/processor workers by
// calling this same script over HTTP. Replies with JSON.
if ($action === 'create_pipeline') {
    header('Content-Type: application/json');

    // Parse JSON body (falls back to regular form fields)
    $contentType = $_SERVER['CONTENT_TYPE'] ?? $_SERVER['HTTP_CONTENT_TYPE'] ?? '';
    $input = [];
    if (stripos($contentType, 'application/json') !== false) {
        $input = json_decode(file_get_contents('php://input'), true) ?: [];
    } else {
        $input = $_POST;
    }
    if (!is_array($input)) {
        $input = [];
    }

    // Turn a newline-separated string (or an already-split array) into a clean
    // list: entries trimmed, empty ones dropped, keys sequential so the list
    // serialises as a JSON array and index-based access in workers is safe.
    $splitLines = function ($value) {
        $lines = is_array($value) ? $value : explode("\n", is_scalar($value) ? (string) $value : '');
        $clean = [];
        foreach ($lines as $line) {
            if (!is_scalar($line)) continue;
            $line = trim((string) $line);
            if ($line) $clean[] = $line;
        }
        return $clean;
    };

    $keywords = $splitLines($input['keywords'] ?? '');
    if (empty($keywords)) { echo json_encode(['error'=>'No keywords']); exit; }

    // Worker counts are clamped to 1..10 (discovery) and 1..8 (processor)
    $discoveryCount = max(1, min(10, (int)($input['discovery_workers'] ?? 3)));
    $processorCount = max(1, min(8, (int)($input['processor_workers'] ?? 2)));
    $discoveryOnly = !empty($input['discovery_only']);
    $processorOnly = !empty($input['processor_only']);

    // Build config array
    $config = [
        'keywords' => $keywords,
        'wave_size' => (int)($input['wave_size'] ?? 50),
        'max_discovered' => (int)($input['max_discovered'] ?? 100000),
        'max_queue_depth' => (int)($input['max_queue_depth'] ?? 5000),
        'batch_size' => (int)($input['batch_size'] ?? 20),
        'quality_threshold' => (int)($input['quality_threshold'] ?? 20),
        'relevance_threshold' => (int)($input['relevance_threshold'] ?? 2),
        'max_imports' => (int)($input['max_imports'] ?? 1000000),
        'links_per_page' => (int)($input['links_per_page'] ?? 150),
        'max_depth' => (int)($input['max_depth'] ?? 5),
        'fetch_delay' => (int)($input['fetch_delay'] ?? 0),
        'pre_fetch' => !empty($input['pre_fetch']),
        // These three default to true when the field is absent from the request
        'follow_external' => isset($input['follow_external']) ? !empty($input['follow_external']) : true,
        'posts_first' => isset($input['posts_first']) ? !empty($input['posts_first']) : true,
        'enable_pagination' => isset($input['enable_pagination']) ? !empty($input['enable_pagination']) : true,
        'url_pattern' => $input['url_pattern'] ?? '',
        'http_user' => $input['http_user'] ?? '',
        'http_pass' => $input['http_pass'] ?? '',
        'include_terms' => $splitLines($input['include_terms'] ?? ''),
        'exclude_terms' => $splitLines($input['exclude_terms'] ?? ''),
        'forced_domains' => $splitLines($input['forced_domains'] ?? ''),
    ];

    // Engine config: one boolean flag per supported engine
    $engines = ['searxng','google','bing','bing_regional','duckduckgo','yahoo','yandex','brave','wikipedia','baidu','direct'];
    foreach ($engines as $eng) {
        $config['engine_' . $eng] = !empty($input['engine_' . $eng]);
    }

    // Launch workers via self-calling HTTP requests.
    // NOTE(review): the self URL is built from the request's Host header and the
    // whole config (including http_pass) travels in the query string — confirm
    // this endpoint is only reachable by trusted clients.
    $processIds = [];
    $selfUrl = (isset($_SERVER['HTTPS']) && $_SERVER['HTTPS'] === 'on' ? 'https' : 'http') . '://' . $_SERVER['HTTP_HOST'] . strtok($_SERVER['REQUEST_URI'], '?');
    $cfgB64 = base64_encode(json_encode($config));

    // Launch discovery workers
    if (!$processorOnly) {
        for ($i = 0; $i < $discoveryCount; $i++) {
            $pid = mpCreateProcess('DIS');
            $workerUrl = $selfUrl . '?worker=discovery&pid=' . urlencode($pid) . '&cfg=' . urlencode($cfgB64);
            $processIds[] = ['id' => $pid, 'type' => 'discovery'];
            // Fire and forget HTTP request
            launchBackgroundWorker($workerUrl);
        }
    }

    // Launch processor workers
    if (!$discoveryOnly) {
        for ($i = 0; $i < $processorCount; $i++) {
            $pid = mpCreateProcess('PRC');
            $workerUrl = $selfUrl . '?worker=processor&pid=' . urlencode($pid) . '&cfg=' . urlencode($cfgB64);
            $processIds[] = ['id' => $pid, 'type' => 'processor'];
            launchBackgroundWorker($workerUrl);
        }
    }

    echo json_encode(['success' => true, 'processes' => $processIds, 'discovery' => $discoveryCount, 'processor' => $processorCount]);
    exit;
}

/**
 * Trigger a worker URL without waiting for it to finish.
 *
 * The request is given 500 ms in total; whatever comes back (or the timeout)
 * is ignored. TLS peer verification is disabled and redirects are followed.
 *
 * @param string $url Worker URL to request.
 * @return void
 */
function launchBackgroundWorker($url) {
    $handle = curl_init();

    curl_setopt($handle, CURLOPT_URL, $url);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
    // Just trigger the worker, don't wait for it
    curl_setopt($handle, CURLOPT_TIMEOUT_MS, 500);
    curl_setopt($handle, CURLOPT_NOSIGNAL, 1);
    curl_setopt($handle, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true);

    curl_exec($handle);
    curl_close($handle);
}

// action=control_pipeline: pause, resume or stop every running/paused pipeline
// process. The command is read from a JSON body or from form fields.
if ($action === 'control_pipeline') {
    header('Content-Type: application/json');

    // Same content-type detection as create_pipeline, including the
    // HTTP_CONTENT_TYPE fallback.
    $contentType = $_SERVER['CONTENT_TYPE'] ?? $_SERVER['HTTP_CONTENT_TYPE'] ?? '';
    $input = [];
    if (stripos($contentType, 'application/json') !== false) {
        $input = json_decode(file_get_contents('php://input'), true) ?: [];
    } else {
        $input = $_POST;
    }

    // Command => status written to pipeline_processes
    $statusMap = ['pause'=>'paused','resume'=>'running','stop'=>'stopped'];
    $cmd = $input['command'] ?? '';
    if (!is_string($cmd) || !isset($statusMap[$cmd])) { echo json_encode(['error'=>'Invalid command']); exit; }

    // Apply to all active pipeline processes
    try {
        $newStatus = $statusMap[$cmd];
        $stmt = $pdo->prepare("UPDATE pipeline_processes SET status=? WHERE mode='pipeline' AND status IN ('running','paused')");
        $stmt->execute([$newStatus]);
        $affected = $stmt->rowCount();

        // If stopping, release all claimed URLs so they return to 'pending'
        if ($cmd === 'stop') {
            $pdo->exec("UPDATE crawler_ready SET status='pending', process_id=NULL, claimed_at=NULL WHERE status='claimed'");
            $pdo->exec("UPDATE crawler_pool SET status='pending', process_id=NULL, claimed_at=NULL WHERE status='claimed'");
        }
        echo json_encode(['success'=>true, 'affected'=>$affected, 'command'=>$cmd]);
    } catch (Exception $e) { echo json_encode(['error'=>$e->getMessage()]); }
    exit;
}

// action=stream_pipeline: Server-Sent Events feed for the dashboard. Every two
// seconds it emits aggregated worker stats, the queue depth and the latest log
// entries; it ends when the client disconnects or when processes exist but
// none of them is running/paused any more.
if ($action === 'stream_pipeline') {
    header('Content-Type: text/event-stream');
    header('Cache-Control: no-cache');
    header('Connection: keep-alive');
    header('X-Accel-Buffering: no');

    // Push pending output to the client. ob_flush() is only valid while an
    // output buffer is active, so it is skipped otherwise.
    $pushToClient = function () {
        if (ob_get_level() > 0) {
            ob_flush();
        }
        flush();
    };

    while (true) {
        if (connection_aborted()) break;

        // Get all pipeline processes
        try {
            $processes = $pdo->query("SELECT id, worker_mode, status, stats, last_heartbeat, created_at FROM pipeline_processes WHERE mode='pipeline' ORDER BY created_at DESC")->fetchAll();
        } catch (Exception $e) { $processes = []; }

        $discoveryStats = ['workers'=>0,'discovered'=>0,'queued'=>0,'rate'=>0,'waves'=>0,'status'=>'idle'];
        $processorStats = ['workers'=>0,'processed'=>0,'imported'=>0,'duplicates'=>0,'errors'=>0,'skipped'=>0,'avg_ms'=>0,'status'=>'idle'];
        $activeWorkers = 0;

        foreach ($processes as $p) {
            // Only running/paused workers contribute to the totals
            if (!in_array($p['status'], ['running','paused'], true)) continue;

            $s = json_decode($p['stats'] ?? '{}', true) ?: [];
            if ($p['worker_mode'] === 'discovery') {
                $discoveryStats['workers']++;
                $discoveryStats['discovered'] += $s['discovered'] ?? 0;
                $discoveryStats['queued'] += $s['queued'] ?? 0;
                $discoveryStats['waves'] += $s['waves'] ?? 0;
                $discoveryStats['rate'] += $s['rate'] ?? 0;
                if (($s['status'] ?? '') === 'backpressure') $discoveryStats['status'] = 'backpressure';
                elseif ($p['status'] === 'running') $discoveryStats['status'] = 'running';
                $activeWorkers++;
            } elseif ($p['worker_mode'] === 'processor') {
                $processorStats['workers']++;
                $processorStats['processed'] += $s['processed'] ?? 0;
                $processorStats['imported'] += $s['imported'] ?? 0;
                $processorStats['duplicates'] += $s['duplicates'] ?? 0;
                $processorStats['errors'] += $s['errors'] ?? 0;
                $processorStats['skipped'] += $s['skipped'] ?? 0;
                // Reported value is the highest avg_ms among the workers
                $processorStats['avg_ms'] = max($processorStats['avg_ms'], $s['avg_ms'] ?? 0);
                if ($p['status'] === 'running') $processorStats['status'] = 'running';
                $activeWorkers++;
            }
        }

        $queueDepth = mpGetQueueDepth($pdo);

        // Get recent logs
        try {
            $logs = $pdo->query("SELECT process_id, entries, created_at FROM crawler_logs WHERE created_at > DATE_SUB(NOW(), INTERVAL 5 MINUTE) ORDER BY created_at DESC LIMIT 20")->fetchAll();
        } catch (Exception $e) { $logs = []; }

        $logEntries = [];
        foreach ($logs as $log) {
            $entries = json_decode($log['entries'] ?? '[]', true) ?: [];
            if (!is_array($entries)) continue;
            foreach ($entries as $entry) {
                // A malformed (non-array) entry must not reach array_merge(),
                // which would throw and terminate the stream.
                if (!is_array($entry)) continue;
                $logEntries[] = array_merge($entry, ['process_id'=>$log['process_id'], 'time'=>$log['created_at']]);
            }
        }

        $data = [
            'discovery' => $discoveryStats,
            'processor' => $processorStats,
            'queue_depth' => $queueDepth,
            'active_workers' => $activeWorkers,
            'logs' => array_slice($logEntries, 0, 20),
            'timestamp' => time()
        ];

        echo "data: " . json_encode($data) . "\n\n";
        $pushToClient();

        if ($activeWorkers === 0 && count($processes) > 0) {
            // All workers finished: send one last event flagged as finished
            echo "data: " . json_encode(array_merge($data, ['finished'=>true])) . "\n\n";
            $pushToClient();
            break;
        }

        sleep(2);
    }
    exit;
}

// action=cleanup_pipeline: remove pipeline process rows, finished/failed queue
// rows and log rows older than one day, then report the outcome as JSON.
if ($action === 'cleanup_pipeline') {
    header('Content-Type: application/json');

    // Executed in this order; the first failure aborts the remaining ones.
    $cleanupStatements = [
        "DELETE FROM pipeline_processes WHERE mode='pipeline'",
        "DELETE FROM crawler_ready WHERE status IN ('done','failed')",
        "DELETE FROM crawler_pool WHERE status IN ('done','error')",
        "DELETE FROM crawler_logs WHERE created_at < DATE_SUB(NOW(), INTERVAL 1 DAY)",
    ];

    try {
        foreach ($cleanupStatements as $sql) {
            $pdo->exec($sql);
        }
        $reply = ['success'=>true,'message'=>'Pipeline cleaned up'];
    } catch (Exception $e) {
        $reply = ['error'=>$e->getMessage()];
    }

    echo json_encode($reply);
    exit;
}

// action=migrate_schema: add the worker_mode column to pipeline_processes when
// it is missing. Safe to call repeatedly.
if ($action === 'migrate_schema') {
    header('Content-Type: application/json');
    try {
        $stmt = $pdo->query("SHOW COLUMNS FROM pipeline_processes LIKE 'worker_mode'");
        // Fetch a row instead of relying on rowCount(), which PDO does not
        // guarantee for statements that return a result set.
        $columnExists = $stmt->fetch() !== false;
        $stmt->closeCursor();

        if (!$columnExists) {
            $pdo->exec("ALTER TABLE pipeline_processes ADD COLUMN worker_mode VARCHAR(20) DEFAULT 'discovery' AFTER mode");
            echo json_encode(['success'=>true,'message'=>'Column worker_mode added successfully']);
        } else {
            echo json_encode(['success'=>true,'message'=>'Column worker_mode already exists']);
        }
    } catch (Exception $e) { echo json_encode(['error'=>$e->getMessage()]); }
    exit;
}

// action=pipeline_status: one-shot JSON snapshot of the running/paused
// pipeline processes together with the current queue depth.
if ($action === 'pipeline_status') {
    header('Content-Type: application/json');

    try {
        $activeSql = "SELECT id, worker_mode, status, stats, last_heartbeat FROM pipeline_processes WHERE mode='pipeline' AND status IN ('running','paused') ORDER BY created_at DESC";
        $snapshot = [
            'processes' => $pdo->query($activeSql)->fetchAll(),
            'queue_depth' => mpGetQueueDepth($pdo),
        ];
    } catch (Exception $e) {
        $snapshot = ['error'=>$e->getMessage()];
    }

    echo json_encode($snapshot);
    exit;
}

// A non-empty action that reaches this point matched none of the handlers
// above, so it is answered with a JSON error. With no action, execution
// continues into the HTML page below.
if (!empty($action)) {
    header('Content-Type: application/json');
    $unknownActionReply = ['error'=>'Unknown action'];
    echo json_encode($unknownActionReply);
    exit;
}

// ============================================
// BLOCO 9-11: HTML + CSS + JAVASCRIPT
// ============================================
?>
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Crawler Pipeline - Producer/Consumer</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0/dist/chart.umd.min.js"></script>
<style>
:root {
    --bg-primary: #0a0e17;
    --bg-secondary: #111827;
    --bg-card: #1a2332;
    --bg-input: #0d1520;
    --text-primary: #e2e8f0;
    --text-secondary: #94a3b8;
    --border: #2a3a4a;
    --accent: #3b82f6;
    --accent-hover: #2563eb;
    --success: #10b981;
    --warning: #f59e0b;
    --danger: #ef4444;
    --purple: #8b5cf6;
    --cyan: #06b6d4;
}
* { margin:0; padding:0; box-sizing:border-box; }
body { background:var(--bg-primary); color:var(--text-primary); font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif; min-height:100vh; }
.container { max-width:1400px; margin:0 auto; padding:1rem; }
.header { display:flex; justify-content:space-between; align-items:center; padding:1rem 0; border-bottom:1px solid var(--border); margin-bottom:1.5rem; }
.header h1 { font-size:1.5rem; background:linear-gradient(135deg, var(--accent), var(--purple)); -webkit-background-clip:text; -webkit-text-fill-color:transparent; }
.header .badge { background:var(--bg-card); border:1px solid var(--border); padding:0.3rem 0.8rem; border-radius:20px; font-size:0.75rem; color:var(--text-secondary); }

/* Config Panel */
.config-panel { display:grid; grid-template-columns:1fr 1fr; gap:1.5rem; }
.config-section { background:var(--bg-card); border:1px solid var(--border); border-radius:12px; padding:1.2rem; }
.config-section h3 { font-size:0.85rem; color:var(--accent); margin-bottom:0.8rem; text-transform:uppercase; letter-spacing:1px; }
.form-group { margin-bottom:0.8rem; }
.form-group label { display:block; font-size:0.75rem; color:var(--text-secondary); margin-bottom:0.3rem; }
.form-group input[type="text"], .form-group input[type="number"], .form-group input[type="password"], .form-group textarea {
    width:100%; background:var(--bg-input); border:1px solid var(--border); border-radius:6px; padding:0.5rem 0.7rem; color:var(--text-primary); font-size:0.8rem; }
.form-group textarea { min-height:60px; resize:vertical; font-family:monospace; }
.form-group input:focus, .form-group textarea:focus { outline:none; border-color:var(--accent); }
.checkbox-grid { display:grid; grid-template-columns:repeat(3, 1fr); gap:0.4rem; }
.checkbox-item { display:flex; align-items:center; gap:0.3rem; font-size:0.75rem; color:var(--text-secondary); }
.checkbox-item input[type="checkbox"] { accent-color:var(--accent); }
.slider-group { display:flex; align-items:center; gap:0.5rem; }
.slider-group input[type="range"] { flex:1; accent-color:var(--accent); }
.slider-group .value { min-width:30px; text-align:center; font-size:0.8rem; color:var(--accent); font-weight:600; }

.btn { padding:0.6rem 1.2rem; border:none; border-radius:8px; cursor:pointer; font-size:0.85rem; font-weight:600; transition:all 0.2s; }
.btn-primary { background:linear-gradient(135deg, var(--accent), var(--purple)); color:white; }
.btn-primary:hover { transform:translateY(-1px); box-shadow:0 4px 12px rgba(59,130,246,0.4); }
.btn-warning { background:var(--warning); color:#000; }
.btn-danger { background:var(--danger); color:white; }
.btn-success { background:var(--success); color:white; }
.btn-sm { padding:0.4rem 0.8rem; font-size:0.75rem; }
.launch-bar { display:flex; justify-content:center; gap:1rem; margin-top:1.5rem; padding:1rem; }

/* Dashboard */
.dashboard { display:none; }
.pipeline-viz { display:grid; grid-template-columns:1fr auto 1fr; gap:1rem; margin-bottom:1.5rem; align-items:stretch; }
.pipeline-col { background:var(--bg-card); border:1px solid var(--border); border-radius:12px; padding:1rem; }
.pipeline-col h3 { font-size:0.8rem; text-transform:uppercase; letter-spacing:1px; margin-bottom:0.8rem; display:flex; align-items:center; gap:0.5rem; }
.pipeline-col.discovery h3 { color:var(--cyan); }
.pipeline-col.queue h3 { color:var(--warning); }
.pipeline-col.processor h3 { color:var(--success); }

.queue-viz { display:flex; flex-direction:column; align-items:center; justify-content:center; min-width:120px; }
.queue-bar { width:40px; height:200px; background:var(--bg-input); border-radius:20px; position:relative; overflow:hidden; border:2px solid var(--border); }
.queue-fill { position:absolute; bottom:0; left:0; right:0; background:linear-gradient(to top, var(--warning), var(--danger)); border-radius:0 0 18px 18px; transition:height 0.5s ease; }
.queue-label { margin-top:0.5rem; font-size:0.75rem; color:var(--text-secondary); text-align:center; }
.queue-count { font-size:1.2rem; font-weight:700; color:var(--warning); margin-top:0.3rem; }

.stat-grid { display:grid; grid-template-columns:repeat(2, 1fr); gap:0.5rem; }
.stat-item { background:var(--bg-input); border-radius:8px; padding:0.6rem; text-align:center; }
.stat-item .value { font-size:1.1rem; font-weight:700; color:var(--text-primary); }
.stat-item .label { font-size:0.65rem; color:var(--text-secondary); margin-top:0.2rem; }
.stat-item.highlight .value { color:var(--accent); }

.status-badge { display:inline-block; padding:0.2rem 0.5rem; border-radius:10px; font-size:0.7rem; font-weight:600; }
.status-badge.running { background:rgba(16,185,129,0.2); color:var(--success); }
.status-badge.paused { background:rgba(245,158,11,0.2); color:var(--warning); }
.status-badge.backpressure { background:rgba(239,68,68,0.2); color:var(--danger); animation:pulse 1.5s infinite; }
.status-badge.finished { background:rgba(59,130,246,0.2); color:var(--accent); }
.status-badge.idle { background:rgba(148,163,184,0.2); color:var(--text-secondary); }

@keyframes pulse { 0%,100% { opacity:1; } 50% { opacity:0.5; } }

.charts-row { display:grid; grid-template-columns:1fr 1fr; gap:1rem; margin-bottom:1.5rem; }
.chart-card { background:var(--bg-card); border:1px solid var(--border); border-radius:12px; padding:1rem; }
.chart-card h4 { font-size:0.75rem; color:var(--text-secondary); margin-bottom:0.5rem; }
.chart-card canvas { max-height:200px; }

.controls-bar { display:flex; gap:0.5rem; align-items:center; justify-content:center; margin-bottom:1rem; padding:0.8rem; background:var(--bg-card); border-radius:12px; border:1px solid var(--border); }

.activity-log { background:var(--bg-card); border:1px solid var(--border); border-radius:12px; padding:1rem; max-height:300px; overflow-y:auto; }
.activity-log h4 { font-size:0.8rem; color:var(--text-secondary); margin-bottom:0.5rem; }
.log-entry { padding:0.3rem 0.5rem; border-bottom:1px solid rgba(42,58,74,0.5); font-size:0.7rem; display:flex; gap:0.5rem; align-items:center; }
.log-entry .time { color:var(--text-secondary); min-width:45px; }
.log-entry .domain { color:var(--purple); min-width:100px; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; }
.log-entry .title { color:var(--text-primary); flex:1; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; }
.log-entry.imported .status-dot { width:6px; height:6px; border-radius:50%; background:var(--success); }
.log-entry.error .status-dot { width:6px; height:6px; border-radius:50%; background:var(--danger); }
.log-entry.skipped .status-dot { width:6px; height:6px; border-radius:50%; background:var(--warning); }

.config-panel.hidden { display:none; }
.dashboard.visible { display:block; }

@media (max-width: 768px) {
    .config-panel { grid-template-columns:1fr; }
    .pipeline-viz { grid-template-columns:1fr; }
    .charts-row { grid-template-columns:1fr; }
    .checkbox-grid { grid-template-columns:repeat(2, 1fr); }
}
</style>
</head>
<body>
<div class="container">
    <div class="header">
        <h1>Crawler Pipeline</h1>
        <span class="badge">Producer → Queue → Consumer</span>
    </div>

    <!-- CONFIG PANEL -->
    <div class="config-panel" id="configPanel">
        <div class="config-section" style="grid-column:span 2">
            <h3>Keywords & Seed URLs</h3>
            <div class="form-group">
                <label>Keywords (one per line) or Seed URLs</label>
                <textarea id="cfgKeywords" rows="4" placeholder="technology news&#10;artificial intelligence&#10;https://example.com/blog"></textarea>
            </div>
        </div>

        <div class="config-section">
            <h3>Workers</h3>
            <div class="form-group">
                <label>Discovery Workers</label>
                <div class="slider-group">
                    <input type="range" id="cfgDiscovery" min="1" max="10" value="3" oninput="document.getElementById('valDiscovery').textContent=this.value">
                    <span class="value" id="valDiscovery">3</span>
                </div>
            </div>
            <div class="form-group">
                <label>Processor Workers</label>
                <div class="slider-group">
                    <input type="range" id="cfgProcessor" min="1" max="8" value="2" oninput="document.getElementById('valProcessor').textContent=this.value">
                    <span class="value" id="valProcessor">2</span>
                </div>
            </div>
            <div class="checkbox-grid">
                <label class="checkbox-item"><input type="checkbox" id="cfgDiscoveryOnly"> Discovery Only</label>
                <label class="checkbox-item"><input type="checkbox" id="cfgProcessorOnly"> Processor Only</label>
                <label class="checkbox-item"><input type="checkbox" id="cfgPreFetch"> Pre-fetch HTML</label>
            </div>
        </div>

        <div class="config-section">
            <h3>Discovery Settings</h3>
            <div class="form-group">
                <label>Wave Size</label>
                <input type="number" id="cfgWaveSize" value="50" min="5" max="200">
            </div>
            <div class="form-group">
                <label>Max Queue Depth</label>
                <input type="number" id="cfgMaxQueue" value="5000" min="100" max="50000">
            </div>
            <div class="form-group">
                <label>Max Discovered URLs</label>
                <input type="number" id="cfgMaxDiscovered" value="100000" min="100">
            </div>
        </div>

        <div class="config-section">
            <h3>Processor Settings</h3>
            <div class="form-group">
                <label>Batch Size</label>
                <input type="number" id="cfgBatchSize" value="20" min="5" max="100">
            </div>
            <div class="form-group">
                <label>Quality Threshold (0-100)</label>
                <input type="number" id="cfgQuality" value="20" min="0" max="100">
            </div>
            <div class="form-group">
                <label>Relevance Threshold (0-100)</label>
                <input type="number" id="cfgRelevance" value="2" min="0" max="100">
            </div>
            <div class="form-group">
                <label>Max Imports</label>
                <input type="number" id="cfgMaxImports" value="1000000" min="10">
            </div>
        </div>

        <div class="config-section">
            <h3>Search Engines</h3>
            <div class="checkbox-grid">
                <label class="checkbox-item"><input type="checkbox" id="engSearxng" checked> SearXNG</label>
                <label class="checkbox-item"><input type="checkbox" id="engGoogle" checked> Google</label>
                <label class="checkbox-item"><input type="checkbox" id="engBing" checked> Bing</label>
                <label class="checkbox-item"><input type="checkbox" id="engBingRegional"> Bing Regional</label>
                <label class="checkbox-item"><input type="checkbox" id="engDuckduckgo" checked> DuckDuckGo</label>
                <label class="checkbox-item"><input type="checkbox" id="engYahoo"> Yahoo</label>
                <label class="checkbox-item"><input type="checkbox" id="engYandex"> Yandex</label>
                <label class="checkbox-item"><input type="checkbox" id="engBrave"> Brave</label>
                <label class="checkbox-item"><input type="checkbox" id="engWikipedia"> Wikipedia</label>
                <label class="checkbox-item"><input type="checkbox" id="engBaidu"> Baidu</label>
                <label class="checkbox-item"><input type="checkbox" id="engDirect"> Direct Sites</label>
            </div>
        </div>

        <div class="config-section">
            <h3>Filters</h3>
            <div class="form-group">
                <label>Include Terms (one per line)</label>
                <textarea id="cfgInclude" rows="2" placeholder="term1&#10;term2"></textarea>
            </div>
            <div class="form-group">
                <label>Exclude Terms (one per line)</label>
                <textarea id="cfgExclude" rows="2" placeholder="spam&#10;casino"></textarea>
            </div>
            <div class="form-group">
                <label>Forced Domains (one per line)</label>
                <textarea id="cfgForced" rows="2" placeholder="example.com"></textarea>
            </div>
            <div class="form-group">
                <label>URL Pattern (regex)</label>
                <input type="text" id="cfgUrlPattern" placeholder="\/article\/|\/post\/">
            </div>
        </div>

        <!-- Advanced crawl options. Pipeline.launch() reads each control by id and sends it
             in the create_pipeline JSON body under these keys:
             cfgLinksPerPage => links_per_page, cfgMaxDepth => max_depth, cfgDelay => fetch_delay,
             cfgHttpUser => http_user, cfgHttpPass => http_pass,
             cfgFollowExternal => follow_external, cfgPostsFirst => posts_first,
             cfgPagination => enable_pagination. -->
        <div class="config-section">
            <h3>Advanced</h3>
            <div class="form-group">
                <label>Links per Page</label>
                <input type="number" id="cfgLinksPerPage" value="150" min="10" max="500">
            </div>
            <div class="form-group">
                <label>Max Depth</label>
                <input type="number" id="cfgMaxDepth" value="5" min="1" max="20">
            </div>
            <div class="form-group">
                <label>Fetch Delay (ms)</label>
                <div class="slider-group">
                    <!-- The inline oninput handler mirrors the slider value into #valDelay with an "ms" suffix. -->
                    <input type="range" id="cfgDelay" min="0" max="5000" step="100" value="0" oninput="document.getElementById('valDelay').textContent=this.value+'ms'">
                    <span class="value" id="valDelay">0ms</span>
                </div>
            </div>
            <!-- Optional HTTP auth credentials; both fields are sent as plain strings (empty when unused). -->
            <div class="form-group">
                <label>HTTP Auth User</label>
                <input type="text" id="cfgHttpUser" placeholder="(optional)">
            </div>
            <div class="form-group">
                <label>HTTP Auth Pass</label>
                <input type="password" id="cfgHttpPass" placeholder="(optional)">
            </div>
            <!-- Boolean crawl flags; all three are checked by default. -->
            <div class="checkbox-grid">
                <label class="checkbox-item"><input type="checkbox" id="cfgFollowExternal" checked> Follow External</label>
                <label class="checkbox-item"><input type="checkbox" id="cfgPostsFirst" checked> Posts First</label>
                <label class="checkbox-item"><input type="checkbox" id="cfgPagination" checked> Pagination</label>
            </div>
        </div>

        <!-- Action bar spanning both grid columns. #btnLaunch is disabled and relabelled by
             Pipeline.launch() while the request is in flight, and restored by Pipeline.showConfig()
             or on a launch error. Cleanup asks for confirmation before calling the server. -->
        <div class="launch-bar" style="grid-column:span 2">
            <button class="btn btn-primary" onclick="Pipeline.launch()" id="btnLaunch">Launch Pipeline</button>
            <button class="btn btn-sm btn-danger" onclick="Pipeline.cleanup()">Cleanup Old Data</button>
        </div>
    </div>

    <!-- DASHBOARD: live view of a running pipeline. Pipeline.showDashboard() reveals it by
         adding the "visible" class; Pipeline.showConfig() removes that class again. -->
    <div class="dashboard" id="dashboard">
        <!-- Controls: pause/resume/stop are sent through Pipeline.control(); the export buttons
             download the counters currently shown on screen (not the imported records). -->
        <div class="controls-bar">
            <button class="btn btn-sm btn-warning" onclick="Pipeline.control('pause')">Pause All</button>
            <button class="btn btn-sm btn-success" onclick="Pipeline.control('resume')">Resume All</button>
            <button class="btn btn-sm btn-danger" onclick="Pipeline.control('stop')">Stop All</button>
            <span style="margin:0 1rem;color:var(--text-secondary)">|</span>
            <button class="btn btn-sm" style="background:var(--bg-input);color:var(--text-primary)" onclick="Pipeline.showConfig()">Back to Config</button>
            <button class="btn btn-sm" style="background:var(--purple);color:white" onclick="Pipeline.exportResults('csv')">Export CSV</button>
            <button class="btn btn-sm" style="background:var(--cyan);color:white" onclick="Pipeline.exportResults('json')">Export JSON</button>
        </div>

        <!-- Three-column pipeline view: Discovery, Queue, Processor.
             Every value element below is rewritten by Pipeline.updateDashboard() on each stream message. -->
        <div class="pipeline-viz">
            <!-- Discovery column: values come from the "discovery" object of the stream payload. -->
            <div class="pipeline-col discovery">
                <h3><span style="font-size:1.2rem">&#x1F50D;</span> Discovery</h3>
                <span class="status-badge" id="discoveryStatus">idle</span>
                <div class="stat-grid" style="margin-top:0.8rem">
                    <div class="stat-item highlight"><div class="value" id="statDiscovered">0</div><div class="label">Discovered</div></div>
                    <div class="stat-item"><div class="value" id="statQueued">0</div><div class="label">Queued</div></div>
                    <div class="stat-item"><div class="value" id="statDiscWorkers">0</div><div class="label">Workers</div></div>
                    <div class="stat-item"><div class="value" id="statDiscRate">0</div><div class="label">URLs/s</div></div>
                </div>
            </div>

            <!-- Queue column: #queueFill height is queue_depth as a percentage of the configured
                 max queue depth (capped at 100%); #queueCount shows the formatted queue_depth. -->
            <div class="pipeline-col queue queue-viz">
                <h3><span style="font-size:1.2rem">&#x1F4E6;</span> Queue</h3>
                <div class="queue-bar">
                    <div class="queue-fill" id="queueFill" style="height:0%"></div>
                </div>
                <div class="queue-count" id="queueCount">0</div>
                <div class="queue-label">pending</div>
            </div>

            <!-- Processor column: values come from the "processor" object of the stream payload. -->
            <div class="pipeline-col processor">
                <h3><span style="font-size:1.2rem">&#x2699;</span> Processor</h3>
                <span class="status-badge" id="processorStatus">idle</span>
                <div class="stat-grid" style="margin-top:0.8rem">
                    <div class="stat-item highlight"><div class="value" id="statImported">0</div><div class="label">Imported</div></div>
                    <div class="stat-item"><div class="value" id="statProcessed">0</div><div class="label">Processed</div></div>
                    <div class="stat-item"><div class="value" id="statDuplicates">0</div><div class="label">Duplicates</div></div>
                    <div class="stat-item"><div class="value" id="statErrors">0</div><div class="label">Errors</div></div>
                    <div class="stat-item"><div class="value" id="statPrcWorkers">0</div><div class="label">Workers</div></div>
                    <div class="stat-item"><div class="value" id="statAvgMs">0</div><div class="label">Avg ms</div></div>
                </div>
            </div>
        </div>

        <!-- Line charts: Pipeline.initCharts() attaches a Chart instance to each canvas and
             Pipeline.updateCharts() appends one sample per stream message. -->
        <div class="charts-row">
            <div class="chart-card">
                <h4>Rate Comparison (Discovery vs Import)</h4>
                <canvas id="chartRates"></canvas>
            </div>
            <div class="chart-card">
                <h4>Queue Depth Over Time</h4>
                <canvas id="chartQueue"></canvas>
            </div>
        </div>

        <!-- Activity log: the contents of #logEntries are replaced wholesale by
             Pipeline.updateActivityLog() whenever a stream message carries log rows. -->
        <div class="activity-log" id="activityLog">
            <h4>Activity Log</h4>
            <div id="logEntries"></div>
        </div>
    </div>
</div>

<script>
/**
 * Pipeline - browser-side controller for the crawler pipeline page.
 *
 * Responsibilities:
 *  - collect the configuration form and POST it to ?action=create_pipeline;
 *  - subscribe to the ?action=stream_pipeline Server-Sent Events feed and render
 *    each message into the dashboard counters, charts and activity log;
 *  - send pause/resume/stop commands and trigger server-side cleanup;
 *  - export the counters currently on screen as JSON or CSV.
 *
 * Relies on the global `Chart` constructor (Chart.js) being loaded by the page;
 * when it is missing the dashboard still works, only the charts stay empty.
 */
const Pipeline = {
    eventSource: null,        // active EventSource, or null while not streaming
    reconnectTimer: null,     // id of the pending reconnect timeout, or null
    reconnectDelayMs: 5000,   // wait before reopening the stream after an error
    rateChart: null,
    queueChart: null,
    rateData: { labels: [], discovery: [], import: [] },
    queueData: { labels: [], values: [] },
    maxDataPoints: 60,        // samples kept per chart series
    maxQueueDepth: 5000,      // denominator for the queue fill bar

    /**
     * Read the configuration form, create the pipeline on the server and, on
     * success, switch to the dashboard and start streaming.
     */
    launch() {
        const el = (id) => document.getElementById(id);
        const num = (id) => +el(id).value;
        const text = (id) => el(id).value;
        const checked = (id) => el(id).checked;

        const keywords = text('cfgKeywords').trim();
        if (!keywords) { alert('Enter keywords or seed URLs'); return; }

        const config = {
            keywords: keywords,
            discovery_workers: num('cfgDiscovery'),
            processor_workers: num('cfgProcessor'),
            discovery_only: checked('cfgDiscoveryOnly'),
            processor_only: checked('cfgProcessorOnly'),
            pre_fetch: checked('cfgPreFetch'),
            wave_size: num('cfgWaveSize'),
            max_queue_depth: num('cfgMaxQueue'),
            max_discovered: num('cfgMaxDiscovered'),
            batch_size: num('cfgBatchSize'),
            quality_threshold: num('cfgQuality'),
            relevance_threshold: num('cfgRelevance'),
            max_imports: num('cfgMaxImports'),
            links_per_page: num('cfgLinksPerPage'),
            max_depth: num('cfgMaxDepth'),
            fetch_delay: num('cfgDelay'),
            follow_external: checked('cfgFollowExternal'),
            posts_first: checked('cfgPostsFirst'),
            enable_pagination: checked('cfgPagination'),
            url_pattern: text('cfgUrlPattern'),
            http_user: text('cfgHttpUser'),
            http_pass: text('cfgHttpPass'),
            include_terms: text('cfgInclude'),
            exclude_terms: text('cfgExclude'),
            forced_domains: text('cfgForced'),
            engine_searxng: checked('engSearxng'),
            engine_google: checked('engGoogle'),
            engine_bing: checked('engBing'),
            engine_bing_regional: checked('engBingRegional'),
            engine_duckduckgo: checked('engDuckduckgo'),
            engine_yahoo: checked('engYahoo'),
            engine_yandex: checked('engYandex'),
            engine_brave: checked('engBrave'),
            engine_wikipedia: checked('engWikipedia'),
            engine_baidu: checked('engBaidu'),
            engine_direct: checked('engDirect')
        };

        // Only accept a usable denominator: 0 or NaN (empty field) would turn the
        // queue bar height into Infinity/NaN percent.
        if (config.max_queue_depth > 0) this.maxQueueDepth = config.max_queue_depth;

        const launchBtn = el('btnLaunch');
        launchBtn.disabled = true;
        launchBtn.textContent = 'Launching...';

        fetch('?action=create_pipeline', {
            method: 'POST',
            headers: {'Content-Type': 'application/json'},
            body: JSON.stringify(config)
        })
        .then(r => r.json())
        .then(data => {
            if (data.error) { alert('Error: ' + data.error); this.resetLaunchButton(); return; }
            this.showDashboard();
            this.startStreaming();
        })
        .catch(err => { alert('Launch failed: ' + err.message); this.resetLaunchButton(); });
    },

    /** Restore the launch button to its idle, clickable state. */
    resetLaunchButton() {
        const launchBtn = document.getElementById('btnLaunch');
        launchBtn.disabled = false;
        launchBtn.textContent = 'Launch Pipeline';
    },

    /**
     * Open (or reopen) the Server-Sent Events stream and render every message.
     * Any previous stream and any pending reconnect are cancelled first, so at
     * most one stream and one reconnect timer exist at a time.
     */
    startStreaming() {
        this.stopStreaming();

        const source = new EventSource('?action=stream_pipeline');
        this.eventSource = source;

        source.onmessage = (e) => {
            let data;
            try {
                data = JSON.parse(e.data);
            } catch (err) {
                console.warn('Pipeline: ignoring malformed stream message', err);
                return;
            }
            if (!data || typeof data !== 'object') return;

            try {
                this.updateDashboard(data);
                this.updateCharts(data);
                this.updateActivityLog(data.logs || []);
            } catch (err) {
                // A rendering problem must not hide itself or block the finished check below.
                console.error('Pipeline: dashboard update failed', err);
            }
            if (data.finished) this.stopStreaming();
        };

        source.onerror = () => {
            // Ignore late errors from a stream that was already replaced or stopped.
            if (this.eventSource !== source) return;
            // Close explicitly so the browser's built-in retry does not compete
            // with the manual reconnect scheduled here.
            source.close();
            this.eventSource = null;
            this.reconnectTimer = setTimeout(() => {
                this.reconnectTimer = null;
                this.startStreaming();
            }, this.reconnectDelayMs);
        };
    },

    /** Close the stream (if any) and cancel a pending reconnect (if any). */
    stopStreaming() {
        if (this.reconnectTimer !== null) {
            clearTimeout(this.reconnectTimer);
            this.reconnectTimer = null;
        }
        if (this.eventSource) {
            this.eventSource.close();
            this.eventSource = null;
        }
    },

    /**
     * Write the counters, queue fill bar and status badges from one stream message.
     * @param {Object} data stream payload; reads data.discovery, data.processor and data.queue_depth.
     */
    updateDashboard(data) {
        const d = data.discovery || {};
        const p = data.processor || {};
        const setText = (id, value) => { document.getElementById(id).textContent = value; };

        setText('statDiscovered', this.fmt(d.discovered || 0));
        setText('statQueued', this.fmt(d.queued || 0));
        setText('statDiscWorkers', d.workers || 0);
        // Number() first: a rate delivered as a string has no toFixed().
        setText('statDiscRate', (Number(d.rate) || 0).toFixed(1));

        setText('statImported', this.fmt(p.imported || 0));
        setText('statProcessed', this.fmt(p.processed || 0));
        setText('statDuplicates', this.fmt(p.duplicates || 0));
        setText('statErrors', this.fmt(p.errors || 0));
        setText('statPrcWorkers', p.workers || 0);
        setText('statAvgMs', p.avg_ms || 0);

        // Queue
        const qd = Number(data.queue_depth) || 0;
        setText('queueCount', this.fmt(qd));
        const capacity = this.maxQueueDepth > 0 ? this.maxQueueDepth : 1;
        const fillPct = Math.max(0, Math.min(100, (qd / capacity) * 100));
        document.getElementById('queueFill').style.height = fillPct + '%';

        // Status badges
        this.setStatus('discoveryStatus', d.status || 'idle');
        this.setStatus('processorStatus', p.status || 'idle');
    },

    /**
     * Append one sample to each chart series, dropping the oldest sample once
     * more than maxDataPoints are stored, then redraw without animation.
     * @param {Object} data stream payload (same shape as for updateDashboard).
     */
    updateCharts(data) {
        const now = this.hhmm(new Date());
        const d = data.discovery || {};
        const p = data.processor || {};

        // Rate chart
        // NOTE(review): the import series divides the cumulative imported count by the seconds
        // elapsed since data.timestamp (minimum 1). That is an average rate only if the server
        // sends the pipeline start time there; if it sends "now" this plots the running total.
        // Formula kept as-is - confirm against the stream_pipeline handler.
        const elapsed = Math.max(1, (Date.now()/1000 - (data.timestamp || Date.now()/1000)));
        this.rateData.labels.push(now);
        this.rateData.discovery.push(Number(d.rate) || 0);
        this.rateData.import.push(p.processed > 0 ? ((p.imported || 0) / elapsed) : 0);
        if (this.rateData.labels.length > this.maxDataPoints) {
            this.rateData.labels.shift(); this.rateData.discovery.shift(); this.rateData.import.shift();
        }
        if (this.rateChart) this.rateChart.update('none');

        // Queue chart
        this.queueData.labels.push(now);
        this.queueData.values.push(data.queue_depth || 0);
        if (this.queueData.labels.length > this.maxDataPoints) {
            this.queueData.labels.shift(); this.queueData.values.shift();
        }
        if (this.queueChart) this.queueChart.update('none');
    },

    /**
     * Replace the activity log with the given rows. An empty list leaves the
     * current log untouched.
     * @param {Array<Object>} logs rows with optional status, time, domain and title fields.
     */
    updateActivityLog(logs) {
        const container = document.getElementById('logEntries');
        if (!Array.isArray(logs) || !logs.length) return;

        const fallbackTime = this.hhmm(new Date());
        // Domain and title originate from crawled pages, so every field is escaped
        // before being placed into markup.
        container.innerHTML = logs.map(log => {
            const status = this.escapeHtml(log.status || 'imported');
            const time = this.escapeHtml(String(log.time || '').slice(11, 16) || fallbackTime);
            const domain = this.escapeHtml(String(log.domain || '').slice(0, 25));
            const title = this.escapeHtml(String(log.title || '').slice(0, 80));
            return '<div class="log-entry ' + status + '"><div class="status-dot"></div><span class="time">' + time + '</span><span class="domain">' + domain + '</span><span class="title">' + title + '</span></div>';
        }).join('');
    },

    /**
     * Create the two line charts on their canvases. Does nothing (apart from a
     * console warning) when Chart.js is not available.
     */
    initCharts() {
        if (typeof Chart === 'undefined') {
            console.warn('Pipeline: Chart.js is not loaded, charts disabled');
            return;
        }
        // Fresh options object per chart so the instances never share mutable state.
        const chartOpts = () => ({ responsive:true, maintainAspectRatio:true, animation:false, scales:{ x:{display:false}, y:{beginAtZero:true, grid:{color:'rgba(42,58,74,0.5)'}, ticks:{color:'#94a3b8',font:{size:10}}} }, plugins:{legend:{labels:{color:'#94a3b8',font:{size:10}}}} });

        this.rateChart = new Chart(document.getElementById('chartRates'), {
            type:'line',
            data:{ labels:this.rateData.labels, datasets:[
                {label:'Discovery Rate',data:this.rateData.discovery,borderColor:'#06b6d4',borderWidth:2,fill:false,pointRadius:0,tension:0.3},
                {label:'Import Rate',data:this.rateData.import,borderColor:'#10b981',borderWidth:2,fill:false,pointRadius:0,tension:0.3}
            ]},
            options:chartOpts()
        });

        this.queueChart = new Chart(document.getElementById('chartQueue'), {
            type:'line',
            data:{ labels:this.queueData.labels, datasets:[
                {label:'Queue Depth',data:this.queueData.values,borderColor:'#f59e0b',backgroundColor:'rgba(245,158,11,0.1)',borderWidth:2,fill:true,pointRadius:0,tension:0.3}
            ]},
            options:chartOpts()
        });
    },

    /**
     * Send a control command to the running pipeline.
     * @param {string} cmd command name; the page uses 'pause', 'resume' and 'stop'.
     */
    control(cmd) {
        fetch('?action=control_pipeline', {
            method:'POST',
            headers:{'Content-Type':'application/json'},
            body:JSON.stringify({command:cmd})
        })
        .then(r => r.json())
        .then(d => { if (d.error) alert(d.error); })
        .catch(err => { alert('Command failed: ' + err.message); });
    },

    /** Ask for confirmation, then request server-side cleanup and show its reply. */
    cleanup() {
        if (!confirm('Clean up all pipeline data?')) return;
        fetch('?action=cleanup_pipeline')
            .then(r => r.json())
            .then(d => { alert(d.message || d.error || 'Done'); })
            .catch(err => { alert('Cleanup failed: ' + err.message); });
    },

    /**
     * Download the counters currently displayed on the dashboard.
     * @param {string} format 'json' for a JSON file; anything else produces CSV.
     */
    exportResults(format) {
        // Collect visible stats
        const shown = (id) => document.getElementById(id).textContent;
        const stats = {
            discovered: shown('statDiscovered'),
            imported: shown('statImported'),
            processed: shown('statProcessed'),
            duplicates: shown('statDuplicates'),
            errors: shown('statErrors'),
            queue_depth: shown('queueCount')
        };
        if (format === 'json') {
            this.downloadBlob(JSON.stringify(stats, null, 2), 'application/json', 'pipeline-stats.json');
        } else {
            const csv = Object.keys(stats).join(',') + '\n' + Object.values(stats).join(',');
            this.downloadBlob(csv, 'text/csv', 'pipeline-stats.csv');
        }
    },

    /**
     * Offer `content` to the user as a file download.
     * @param {string} content file body.
     * @param {string} mimeType MIME type of the blob.
     * @param {string} filename suggested file name.
     */
    downloadBlob(content, mimeType, filename) {
        const url = URL.createObjectURL(new Blob([content], {type: mimeType}));
        const a = document.createElement('a');
        a.href = url;
        a.download = filename;
        a.click();
        // Release the blob once the browser has had time to start the download.
        setTimeout(() => URL.revokeObjectURL(url), 1000);
    },

    /** Hide the configuration panel, show the dashboard and create the charts once. */
    showDashboard() {
        document.getElementById('configPanel').classList.add('hidden');
        document.getElementById('dashboard').classList.add('visible');
        if (!this.rateChart) this.initCharts();
    },

    /** Stop streaming, return to the configuration panel and re-enable launching. */
    showConfig() {
        this.stopStreaming();
        document.getElementById('configPanel').classList.remove('hidden');
        document.getElementById('dashboard').classList.remove('visible');
        this.resetLaunchButton();
    },

    /**
     * Show a status on a badge element, both as its text and as a CSS class.
     * @param {string} elemId id of the badge element.
     * @param {string} status status name.
     */
    setStatus(elemId, status) {
        const el = document.getElementById(elemId);
        el.textContent = status;
        el.className = 'status-badge ' + status;
    },

    /**
     * Format a count compactly: 1500 becomes '1.5K', 2500000 becomes '2.5M'.
     * @param {number} n value to format; null/undefined are shown as '0'.
     * @returns {string}
     */
    fmt(n) {
        if (n === null || n === undefined) return '0';
        if (n >= 1000000) return (n/1000000).toFixed(1)+'M';
        if (n >= 1000) return (n/1000).toFixed(1)+'K';
        return n.toString();
    },

    /**
     * Escape a value for safe use inside HTML text or a quoted attribute.
     * @param {*} value converted with String() first.
     * @returns {string}
     */
    escapeHtml(value) {
        const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
        return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
    },

    /**
     * Local time as zero-padded 'HH:MM' (24-hour), independent of the browser locale.
     * @param {Date} date
     * @returns {string}
     */
    hhmm(date) {
        return date.toTimeString().slice(0, 5);
    }
};
</script>
</body>
</html>
