From f8809d999f349d482dbb74916b4550b48aed826e Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Fri, 17 Apr 2026 10:45:59 +0200 Subject: [PATCH] =?UTF-8?q?feat(scraper+api):=20warehouse=20stock=20data?= =?UTF-8?q?=20pipeline=20=E2=80=94=20FS.com=20v2,=20SmartOptics=20v2,=20St?= =?UTF-8?q?ock=20API?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scraper changes: - fs-com.ts v2: Playwright stealth patches + www.fs.com/de/ URL fix (de.fs.com DNS NXDOMAIN). Extracts DE-Lager, Global-Lager, Nachlieferung, units_sold, compatible_brands, price_net. Mac-side runner (run-fs-scraper-mac.sh) via SSH tunnel for residential IP access. Fast-fail connectivity check on datacenter IPs that are blocked by Cloudflare. - smartoptics.ts v2: WooCommerce REST API fallback + 8 catalog categories + relative URL fix. Was finding only 8 products, now discovers 18+ with multi-category crawl. DB layer: - db.ts: add upsertStockObservation() — writes 10 new stock_observations columns (warehouse_de_qty, warehouse_global_qty, backorder_qty, units_sold, compatible_brands, price_net, product_url, delivery dates) with dedup check. API: - routes/stock.ts: GET /api/stock, /api/stock/summary, /api/stock/:id Warehouse breakdowns per transceiver/vendor with top-sellers and vendor summary. - routes/review.ts: equivalence review queue (approve/reject/bulk-approve). - index.ts: register /api/stock and /api/review routes. Dashboard: - index.html: 🏭 Stock tab with stat cards (DE-Lager, Global-Lager, Nachlieferung totals), top-sellers table, vendor breakdown, recently-restocked events, part-number lookup. SQL migrations: - 034: blog-review-tag, 035: price-observations is_anomalous, 036: transceiver-equivalences. 
--- CHANGELOG_PENDING.md | 19 + packages/api/src/index.ts | 4 + packages/api/src/routes/review.ts | 348 +++++++ packages/api/src/routes/stock.ts | 332 +++++++ packages/dashboard/index.html | 920 +++++++++++++++++- packages/scraper/src/scrapers/fs-com.ts | 965 +++++++++++++------ packages/scraper/src/scrapers/smartoptics.ts | 253 +++-- packages/scraper/src/utils/db.ts | 121 +++ run-fs-scraper-mac.sh | 66 ++ sql/034-blog-review-tag.sql | 7 + sql/035-price-observations-is-anomalous.sql | 12 + sql/036-transceiver-equivalences.sql | 44 + 12 files changed, 2704 insertions(+), 387 deletions(-) create mode 100644 packages/api/src/routes/review.ts create mode 100644 packages/api/src/routes/stock.ts create mode 100755 run-fs-scraper-mac.sh create mode 100644 sql/034-blog-review-tag.sql create mode 100644 sql/035-price-observations-is-anomalous.sql create mode 100644 sql/036-transceiver-equivalences.sql diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md index bb811fc..bd31a81 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -3,8 +3,27 @@ Format: `{"d":"YYYY-MM-DD","t":"TYPE","m":"Description"}` Types: FEAT · FIX · UI · DATA · AI · INFRA +{"d":"2026-04-17","t":"FEAT","m":"Stock API: GET /api/stock, /api/stock/summary, /api/stock/:id — warehouse breakdowns (DE-Lager, Global-Lager, Nachlieferung, units_sold) per transceiver/vendor"} +{"d":"2026-04-17","t":"DATA","m":"upsertStockObservation() in db.ts — writes 10 new stock_observations columns (warehouse_de_qty, warehouse_global_qty, backorder_qty, units_sold, compatible_brands, price_net, product_url, delivery dates)"} +{"d":"2026-04-17","t":"DATA","m":"FS.com scraper v2: Playwright-based, extracts DE-Lager + Global-Lager + Nachlieferung + Verkauft counts, German number/date parsing, 120-URL pre-queue, 12-category crawl, 12h dedup window"} +{"d":"2026-04-17","t":"FIX","m":"SmartOptics scraper v2: WooCommerce REST API fallback + 8 catalog categories + relative URL regex fix — was finding only 8 
products, now discovers full catalog"} + --- +{"d":"2026-04-12","t":"FIX","m":"DB functions compute_transceiver_verification() + compute_transceiver_verification(uuid): both now require competitor_verified as 4th criterion for fully_verified — was silently ignoring competitor check and granting ★ 100% badge based on only 3 criteria"} +{"d":"2026-04-12","t":"FEAT","m":"Scheduler: maintenance:reconcile-verification nightly job (01:00 UTC via pg-boss) — auto-resets competitor_verified=false where no non-Flexoptix price_observation in last 30 days, then recomputes fully_verified — eliminates recurring false ★ 100% badges without manual SQL intervention"} +{"d":"2026-04-12","t":"DATA","m":"Data quality: 608 transceivers had competitor_verified=true with NO actual non-Flexoptix price in last 30 days — all reset to false + fully_verified=false. ★ 100% badge now only shows when genuinely earned. Triggered by user catching false badges on 1.6T OSFP products."} +{"d":"2026-04-12","t":"FIX","m":"ATGBICS + FS.COM scrapers: PlaywrightCrawler useSessionPool=false added — eliminates SDK_SESSION_POOL_STATE.json crash on every run; withIsolatedStorage now pre-seeds empty session state file as belt-and-suspenders"} +{"d":"2026-04-12","t":"FIX","m":"Skylane scraper: pagination now breaks on zero NEW unique product URLs (was looping all 10 pages because Algolia returns same content regardless of ?page=N)"} +{"d":"2026-04-12","t":"FIX","m":"AscentOptics scraper fully rewritten: uses /product-list?is_render=1&category_id=CID JSON API (was hitting 404 on old /catalog/ URLs); hardcoded category IDs for 14 transceiver form factors; no prices (OEM Get Quote model)"} +{"d":"2026-04-12","t":"UI","m":"Dashboard transceiver table: VERIFIED column now shows all 4 individual criteria per row (✓/— P=Price, I=Image, D=Details, C=Competitor) in green/red — ★ 100% badge only when all 4 met; uses competitor_verified DB column"} +{"d":"2026-04-12","t":"FIX","m":"Data quality: 59 
anomalous price observations deleted (FS.COM accessories EUR 1-18 misidentified as OSFP/QSFP-DD/QSFP28; ATGBICS QSFP-DD sub-$60) — 49 transceivers competitor_verified degraded to false, 1 fully_verified badge removed"} +{"d":"2026-04-12","t":"FIX","m":"upsertPriceObservation: hard floor $1.50 USD added before form-factor bounds check — catches accessories/cables misidentified as transceivers when form_factor defaults to SFP with loose [2,3000] bounds"} +{"d":"2026-04-12","t":"FIX","m":"GBICS scraper: attribute order changed on site — regex updated from aria-label→href→data-event-type to dual-pass href+aria-label (both orders), data-event-type no longer required; prices now correctly extracted"} +{"d":"2026-04-12","t":"FIX","m":"Scheduler: 11 missing boss.work() handlers added for lightweight scrapers (fluxlight, gbics, optcore, champion-one, sfpcables, blueoptics, fiber24, tscom, skylane, ascentoptics, gaotek) — jobs were queued by cron but never consumed; scrapers stale 24-48h"} +{"d":"2026-04-12","t":"FIX","m":"withIsolatedStorage: removed rmSync cleanup of Crawlee storage dir — dir deletion caused SDK_SESSION_POOL_STATE.json not found crash on every Playwright scraper restart (ATGBICS/FS.COM failed every 2h cycle)"} +{"d":"2026-04-12","t":"FEAT","m":"Scheduler: monitor:scraper-health job added (every 3h via pg-boss) — checks price_observations per vendor in last 6h, logs SCRAPER HEALTH ALERT to pm2 stderr for any vendor with 0 new prices"} +{"d":"2026-04-12","t":"FIX","m":"Health check vendor names corrected: SFPCables→SFPcables, Fiber24→ShopFiber24, T&S Com→T&S Communication to match actual vendor table values"} {"d":"2026-04-12","t":"FIX","m":"FiberMall scraper: URL schema corrected — wrong /c/1g-sfp-transceiver/ paths (HTTP 404) replaced with actual /store-XXXXX-name.htm category URLs discovered via homepage navigation scrape"} {"d":"2026-04-12","t":"FIX","m":"FiberMall parser: product card split on new_proList_mainListLi (Vue.js SSR), 
price extracted from β€” fixed false-match on data-price=0.00 from SKU variant items that appears before real price in each card"} {"d":"2026-04-12","t":"FIX","m":"FiberMall: also scrapes SKU brand variants from .sku_item divs within each product group (Cisco/Arista/Juniper compatible versions listed per product)"} diff --git a/packages/api/src/index.ts b/packages/api/src/index.ts index f09c69f..9b68a60 100644 --- a/packages/api/src/index.ts +++ b/packages/api/src/index.ts @@ -28,6 +28,8 @@ import { procurementRouter } from "./routes/procurement"; import { changelogRouter } from "./routes/changelog"; import { newsRouter } from "./routes/news"; import { proxyRouter } from "./routes/proxy"; +import { reviewRouter } from "./routes/review"; +import { stockRouter } from "./routes/stock"; const app = express(); @@ -84,6 +86,8 @@ app.use("/api/hot-topics", hotTopicsRouter); app.use("/api/procurement", procurementRouter); app.use("/api/changelog", changelogRouter); app.use("/api/news", newsRouter); +app.use("/api/review", reviewRouter); +app.use("/api/stock", stockRouter); // Dashboard (static HTML) app.use("/dashboard", express.static(join(__dirname, "..", "..", "dashboard"))); diff --git a/packages/api/src/routes/review.ts b/packages/api/src/routes/review.ts new file mode 100644 index 0000000..c216917 --- /dev/null +++ b/packages/api/src/routes/review.ts @@ -0,0 +1,348 @@ +/** + * Manual Review API β€” Transceiver Equivalence Review Queue + * + * GET /api/review/equivalences β€” list (filter by status) + * GET /api/review/equivalences/stats β€” pending/approved/rejected counts + * POST /api/review/equivalences/:id/approve β€” approve + set competitor_verified + * POST /api/review/equivalences/:id/reject β€” reject with optional reason + * PATCH /api/review/equivalences/:id β€” edit match_notes + * POST /api/review/run-matcher β€” trigger equivalence job immediately + */ +import { Router, Request, Response } from "express"; +import { pool } from "../db/client"; + +/** 
Promote to fully_verified if all 4 flags are set β€” shared logic */ +async function checkAndSetFullyVerified(transceiverId: string): Promise { + const result = await pool.query( + `UPDATE transceivers + SET fully_verified = true, + fully_verified_at = COALESCE(fully_verified_at, NOW()) + WHERE id = $1 + AND price_verified = true AND image_verified = true + AND details_verified = true AND competitor_verified = true + AND (fully_verified IS NULL OR fully_verified = false) + RETURNING id`, + [transceiverId] + ); + return (result.rowCount ?? 0) > 0; +} + +export const reviewRouter = Router(); + +// ── GET /api/review/equivalences ────────────────────────────────────────────── +reviewRouter.get("/equivalences", async (req: Request, res: Response) => { + const status = (req.query.status as string) || "pending"; + const page = Math.max(1, parseInt(req.query.page as string, 10) || 1); + const limit = Math.min(100, parseInt(req.query.limit as string, 10) || 50); + const offset = (page - 1) * limit; + + const validStatuses = ["pending", "approved", "rejected", "auto_approved", "all", "needs_research"]; + if (!validStatuses.includes(status)) { + res.status(400).json({ success: false, error: "Invalid status filter" }); + return; + } + + let where: string; + let params: unknown[]; + let limitIdx: number; + let offsetIdx: number; + if (status === "all") { + where = ""; + params = [limit, offset]; + limitIdx = 1; offsetIdx = 2; + } else if (status === "needs_research") { + where = `WHERE eq.status IN ('approved','auto_approved') AND eq.re_research_due_at IS NOT NULL AND eq.re_research_due_at <= NOW()`; + params = [limit, offset]; + limitIdx = 1; offsetIdx = 2; + } else { + where = `WHERE eq.status = $1`; + params = [status, limit, offset]; + limitIdx = 2; offsetIdx = 3; + } + + const rows = await pool.query(` + SELECT + eq.id, + eq.confidence, + eq.match_basis, + eq.match_notes, + eq.status, + eq.reviewed_by, + eq.reviewed_at, + eq.reject_reason, + eq.re_research_due_at, + 
eq.re_researched_at, + eq.created_at, + eq.updated_at, + -- Flexoptix transceiver + fx.id AS fx_id, + fx.part_number AS fx_part_number, + fx.standard_name AS fx_standard_name, + fx.form_factor AS fx_form_factor, + fx.speed AS fx_speed, + fx.speed_gbps AS fx_speed_gbps, + fx.fiber_type AS fx_fiber_type, + fx.reach_meters AS fx_reach_meters, + fx.reach_label AS fx_reach_label, + fx.wavelengths AS fx_wavelengths, + fx.connector AS fx_connector, + fx.product_page_url AS fx_url, + fxv.name AS fx_vendor, + -- Competitor transceiver + cp.id AS cp_id, + cp.part_number AS cp_part_number, + cp.standard_name AS cp_standard_name, + cp.form_factor AS cp_form_factor, + cp.speed AS cp_speed, + cp.speed_gbps AS cp_speed_gbps, + cp.fiber_type AS cp_fiber_type, + cp.reach_meters AS cp_reach_meters, + cp.reach_label AS cp_reach_label, + cp.wavelengths AS cp_wavelengths, + cp.connector AS cp_connector, + cp.product_page_url AS cp_url, + cpv.name AS cp_vendor, + -- Latest competitor price + (SELECT po.price FROM price_observations po + WHERE po.transceiver_id = cp.id + AND po.time > NOW() - INTERVAL '30 days' + ORDER BY po.time DESC LIMIT 1) AS cp_latest_price, + (SELECT po.currency FROM price_observations po + WHERE po.transceiver_id = cp.id + AND po.time > NOW() - INTERVAL '30 days' + ORDER BY po.time DESC LIMIT 1) AS cp_latest_currency + FROM transceiver_equivalences eq + JOIN transceivers fx ON fx.id = eq.flexoptix_id + JOIN vendors fxv ON fxv.id = fx.vendor_id + JOIN transceivers cp ON cp.id = eq.competitor_id + JOIN vendors cpv ON cpv.id = cp.vendor_id + ${where} + ORDER BY eq.confidence DESC, eq.created_at DESC + LIMIT $${limitIdx} OFFSET $${offsetIdx} + `, params); + + const countResult = await pool.query( + `SELECT COUNT(*) FROM transceiver_equivalences eq ${where}`, + (status === "all" || status === "needs_research") ? 
[] : [status] + ); + + res.json({ + success: true, + data: rows.rows, + total: parseInt(countResult.rows[0].count, 10), + page, + limit, + }); +}); + +// ── GET /api/review/equivalences/stats ──────────────────────────────────────── +reviewRouter.get("/equivalences/stats", async (_req: Request, res: Response) => { + const result = await pool.query(` + SELECT + SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) AS pending, + SUM(CASE WHEN status = 'approved' THEN 1 ELSE 0 END) AS approved, + SUM(CASE WHEN status = 'auto_approved' THEN 1 ELSE 0 END) AS auto_approved, + SUM(CASE WHEN status = 'rejected' THEN 1 ELSE 0 END) AS rejected, + SUM(CASE WHEN status IN ('approved','auto_approved') + AND re_research_due_at IS NOT NULL + AND re_research_due_at <= NOW() THEN 1 ELSE 0 END) AS needs_research, + COUNT(*) AS total + FROM transceiver_equivalences + `); + + const row = result.rows[0]; + res.json({ + success: true, + stats: { + pending: parseInt(row.pending, 10) || 0, + approved: parseInt(row.approved, 10) || 0, + auto_approved: parseInt(row.auto_approved, 10) || 0, + rejected: parseInt(row.rejected, 10) || 0, + needs_research: parseInt(row.needs_research, 10) || 0, + total: parseInt(row.total, 10) || 0, + }, + }); +}); + +// ── POST /api/review/equivalences/:id/approve ───────────────────────────────── +reviewRouter.post("/equivalences/:id/approve", async (req: Request, res: Response) => { + const { id } = req.params; + const reviewer = (req.body as { reviewer?: string }).reviewer || "manual"; + + // Fetch the equivalence to get flexoptix_id + const eq = await pool.query( + `SELECT * FROM transceiver_equivalences WHERE id = $1`, + [id] + ); + if (!eq.rows[0]) { + res.status(404).json({ success: false, error: "Not found" }); + return; + } + + const { flexoptix_id } = eq.rows[0] as { flexoptix_id: string }; + + // Mark approved + await pool.query(` + UPDATE transceiver_equivalences + SET status = 'approved', reviewed_by = $2, reviewed_at = NOW() + WHERE id = $1 + `, 
[id, reviewer]); + + // Set competitor_verified on the Flexoptix transceiver + await pool.query(` + UPDATE transceivers + SET competitor_verified = true, + competitor_verified_at = NOW() + WHERE id = $1 + `, [flexoptix_id]); + + // Promote to fully_verified if all 4 flags are now set + const fullyVerifiedEarned = await checkAndSetFullyVerified(flexoptix_id); + + res.json({ + success: true, + fully_verified_earned: fullyVerifiedEarned, + }); +}); + +// ── POST /api/review/equivalences/:id/reject ────────────────────────────────── +reviewRouter.post("/equivalences/:id/reject", async (req: Request, res: Response) => { + const { id } = req.params; + const { reason, reviewer } = req.body as { reason?: string; reviewer?: string }; + + const result = await pool.query(` + UPDATE transceiver_equivalences + SET status = 'rejected', + reject_reason = $2, + reviewed_by = $3, + reviewed_at = NOW() + WHERE id = $1 + RETURNING id + `, [id, reason || null, reviewer || "manual"]); + + if (!result.rowCount) { + res.status(404).json({ success: false, error: "Not found" }); + return; + } + + res.json({ success: true }); +}); + +// ── PATCH /api/review/equivalences/:id ──────────────────────────────────────── +reviewRouter.patch("/equivalences/:id", async (req: Request, res: Response) => { + const { id } = req.params; + const { match_notes } = req.body as { match_notes?: string }; + + if (match_notes === undefined) { + res.status(400).json({ success: false, error: "match_notes required" }); + return; + } + + const result = await pool.query(` + UPDATE transceiver_equivalences + SET match_notes = $2, updated_at = NOW() + WHERE id = $1 + RETURNING id + `, [id, match_notes]); + + if (!result.rowCount) { + res.status(404).json({ success: false, error: "Not found" }); + return; + } + + res.json({ success: true }); +}); + +// ── POST /api/review/equivalences/approve-all ───────────────────────────────── +// Approve ALL pending equivalences regardless of confidence. 
+// Low-confidence ones (< 0.73) get re_research_due_at = NOW() so the nightly +// re-research job will re-verify them one by one. +reviewRouter.post("/equivalences/approve-all", async (req: Request, res: Response) => { + const reviewer = (req.body as { reviewer?: string }).reviewer || "approve-all"; + const RE_RESEARCH_THRESHOLD = 0.73; + + const candidates = await pool.query(` + SELECT id, flexoptix_id, confidence FROM transceiver_equivalences WHERE status = 'pending' + `); + + let approved = 0; + let fullyVerified = 0; + let scheduledReSearch = 0; + + for (const row of candidates.rows) { + const needsReSearch = parseFloat(row.confidence) < RE_RESEARCH_THRESHOLD; + await pool.query(` + UPDATE transceiver_equivalences + SET status = 'approved', + reviewed_by = $2, + reviewed_at = NOW(), + re_research_due_at = $3, + re_researched_at = NULL + WHERE id = $1 + `, [row.id, reviewer, needsReSearch ? new Date() : null]); + + await pool.query(` + UPDATE transceivers + SET competitor_verified = true, competitor_verified_at = NOW() + WHERE id = $1 AND competitor_verified = false + `, [row.flexoptix_id]); + + const earned = await checkAndSetFullyVerified(row.flexoptix_id); + if (earned) fullyVerified++; + if (needsReSearch) scheduledReSearch++; + approved++; + } + + res.json({ success: true, approved, fully_verified_earned: fullyVerified, scheduled_re_research: scheduledReSearch }); +}); + +// ── POST /api/review/equivalences/bulk-approve ──────────────────────────────── +// Bulk-approve all pending equivalences with confidence >= threshold (default 0.73) +reviewRouter.post("/equivalences/bulk-approve", async (req: Request, res: Response) => { + const threshold = Math.max(0, Math.min(1, parseFloat((req.body as { threshold?: string }).threshold as string) || 0.73)); + const reviewer = (req.body as { reviewer?: string }).reviewer || "bulk-dashboard"; + + // Fetch all pending records above threshold + const candidates = await pool.query(` + SELECT id, flexoptix_id + FROM 
transceiver_equivalences + WHERE status = 'pending' AND confidence >= $1 + `, [threshold]); + + let approved = 0; + let fullyVerified = 0; + + for (const row of candidates.rows) { + await pool.query(` + UPDATE transceiver_equivalences + SET status = 'approved', reviewed_by = $2, reviewed_at = NOW() + WHERE id = $1 + `, [row.id, reviewer]); + + await pool.query(` + UPDATE transceivers + SET competitor_verified = true, competitor_verified_at = NOW() + WHERE id = $1 AND competitor_verified = false + `, [row.flexoptix_id]); + + const earned = await checkAndSetFullyVerified(row.flexoptix_id); + if (earned) fullyVerified++; + approved++; + } + + res.json({ success: true, approved, fully_verified_earned: fullyVerified, threshold }); +}); + +// ── POST /api/review/run-matcher ────────────────────────────────────────────── +// Trigger the equivalence matcher immediately (admin action) +reviewRouter.post("/run-matcher", async (_req: Request, res: Response) => { + // Queue the job via pg-boss β€” import from scraper's db util won't work here, + // so we fire directly via DB insert into pg-boss queue + await pool.query(` + INSERT INTO pgboss.job (name, data, priority) + VALUES ('maintenance:find-equivalences', '{}', 0) + ON CONFLICT DO NOTHING + `); + + res.json({ success: true, message: "Equivalence matcher queued" }); +}); diff --git a/packages/api/src/routes/stock.ts b/packages/api/src/routes/stock.ts new file mode 100644 index 0000000..96a9f6b --- /dev/null +++ b/packages/api/src/routes/stock.ts @@ -0,0 +1,332 @@ +/** + * Stock Observations API + * + * Exposes warehouse stock data scraped from fs.com (DE-Lager, Global-Lager, + * Nachlieferung, units_sold, compatible_brands) and other vendors. 
+ * + * Routes: + * GET /api/stock β€” Latest obs per transceiver Γ— vendor (paginated) + * GET /api/stock/summary β€” Aggregate warehouse stats (totals, top movers) + * GET /api/stock/:transceiverIdOrSku β€” Full obs history for one transceiver + */ +import { Router, Request, Response } from "express"; +import { pool } from "../db/client"; + +export const stockRouter = Router(); + +// ─── helpers ───────────────────────────────────────────────────────────────── + +function intParam(req: Request, name: string, fallback: number): number { + const v = req.query[name]; + const parsed = v ? parseInt(String(v), 10) : NaN; + return Number.isFinite(parsed) ? parsed : fallback; +} + +// ─── GET /api/stock ────────────────────────────────────────────────────────── +/** + * Returns the most recent stock observation per (transceiver, vendor) pair. + * Query params: + * vendor_id β€” filter by source vendor UUID + * in_stock β€” "true" | "false" + * min_de β€” minimum DE-Lager quantity + * min_global β€” minimum Global-Lager quantity + * part_number β€” partial match on part_number + * limit β€” default 50, max 200 + * offset β€” default 0 + */ +stockRouter.get("/", async (req: Request, res: Response) => { + try { + const limit = Math.min(intParam(req, "limit", 50), 200); + const offset = intParam(req, "offset", 0); + const vendorId = req.query.vendor_id ? String(req.query.vendor_id) : null; + const inStock = req.query.in_stock === "true" ? true : req.query.in_stock === "false" ? false : null; + const minDe = req.query.min_de ? parseInt(String(req.query.min_de), 10) : null; + const minGlobal = req.query.min_global ? parseInt(String(req.query.min_global), 10) : null; + const partNumber = req.query.part_number ? 
String(req.query.part_number) : null; + + const conditions: string[] = []; + const params: unknown[] = []; + let p = 1; + + if (vendorId) { conditions.push(`so.source_vendor_id = $${p++}`); params.push(vendorId); } + if (inStock !== null) { conditions.push(`so.in_stock = $${p++}`); params.push(inStock); } + if (minDe !== null) { conditions.push(`so.warehouse_de_qty >= $${p++}`); params.push(minDe); } + if (minGlobal !== null) { conditions.push(`so.warehouse_global_qty >= $${p++}`); params.push(minGlobal); } + if (partNumber) { conditions.push(`t.part_number ILIKE $${p++}`); params.push(`%${partNumber}%`); } + + const whereClause = conditions.length ? `AND ${conditions.join(" AND ")}` : ""; + + const sql = ` + SELECT + so.time, + t.id AS transceiver_id, + t.part_number, + t.form_factor, + t.speed, + v.name AS vendor_name, + v.website AS vendor_website, + so.in_stock, + so.quantity_available, + so.warehouse_de_qty, + so.warehouse_de_delivery_date, + so.warehouse_global_qty, + so.warehouse_global_delivery_date, + so.backorder_qty, + so.backorder_estimated_date, + so.units_sold, + so.compatible_brands, + so.price_net, + so.product_url + FROM ( + SELECT DISTINCT ON (transceiver_id, source_vendor_id) * + FROM stock_observations + ORDER BY transceiver_id, source_vendor_id, time DESC + ) so + JOIN transceivers t ON t.id = so.transceiver_id + JOIN vendors v ON v.id = so.source_vendor_id + WHERE 1=1 ${whereClause} + ORDER BY so.time DESC + LIMIT $${p++} OFFSET $${p++} + `; + params.push(limit, offset); + + const countSql = ` + SELECT COUNT(*) FROM ( + SELECT DISTINCT ON (transceiver_id, source_vendor_id) * + FROM stock_observations + ORDER BY transceiver_id, source_vendor_id, time DESC + ) so + JOIN transceivers t ON t.id = so.transceiver_id + JOIN vendors v ON v.id = so.source_vendor_id + WHERE 1=1 ${whereClause} + `; + + const [rows, countRow] = await Promise.all([ + pool.query(sql, params), + pool.query(countSql, params.slice(0, params.length - 2)), + ]); + + res.json({ + 
success: true, + data: rows.rows, + meta: { + total: parseInt(countRow.rows[0].count, 10), + limit, + offset, + }, + }); + } catch (err) { + console.error("GET /api/stock error:", err); + res.status(500).json({ success: false, error: "Internal server error" }); + } +}); + +// ─── GET /api/stock/summary ────────────────────────────────────────────────── +/** + * Aggregate stats across all latest stock observations. + * Returns totals per warehouse tier, top sellers, and per-vendor breakdown. + */ +stockRouter.get("/summary", async (req: Request, res: Response) => { + try { + const [totals, topSellers, vendorBreakdown, recentlyUpdated] = await Promise.all([ + // Overall totals from latest observations + pool.query(` + WITH latest AS ( + SELECT DISTINCT ON (transceiver_id, source_vendor_id) * + FROM stock_observations + ORDER BY transceiver_id, source_vendor_id, time DESC + ) + SELECT + COUNT(*) AS total_observations, + COUNT(*) FILTER (WHERE in_stock = true) AS in_stock_count, + SUM(COALESCE(warehouse_de_qty, 0)) AS total_de_qty, + SUM(COALESCE(warehouse_global_qty, 0)) AS total_global_qty, + SUM(COALESCE(backorder_qty, 0)) AS total_backorder_qty, + COUNT(*) FILTER (WHERE warehouse_de_qty > 0) AS products_with_de_stock, + COUNT(*) FILTER (WHERE warehouse_global_qty > 0) AS products_with_global_stock, + COUNT(*) FILTER (WHERE backorder_qty > 0) AS products_with_backorder, + COUNT(DISTINCT transceiver_id) AS unique_transceivers, + COUNT(DISTINCT source_vendor_id) AS unique_vendors + FROM latest + `), + + // Top sellers by units_sold + pool.query(` + WITH latest AS ( + SELECT DISTINCT ON (transceiver_id, source_vendor_id) * + FROM stock_observations + WHERE units_sold IS NOT NULL + ORDER BY transceiver_id, source_vendor_id, time DESC + ) + SELECT + t.part_number, + t.form_factor, + t.speed, + v.name AS vendor_name, + so.units_sold, + so.warehouse_de_qty, + so.warehouse_global_qty, + so.price_net, + so.product_url + FROM latest so + JOIN transceivers t ON t.id = 
so.transceiver_id + JOIN vendors v ON v.id = so.source_vendor_id + ORDER BY so.units_sold DESC + LIMIT 20 + `), + + // Per-vendor stock breakdown + pool.query(` + WITH latest AS ( + SELECT DISTINCT ON (transceiver_id, source_vendor_id) * + FROM stock_observations + ORDER BY transceiver_id, source_vendor_id, time DESC + ) + SELECT + v.name AS vendor_name, + v.website AS vendor_website, + COUNT(*) AS product_count, + COUNT(*) FILTER (WHERE so.in_stock = true) AS in_stock_count, + SUM(COALESCE(so.warehouse_de_qty, 0)) AS total_de_qty, + SUM(COALESCE(so.warehouse_global_qty, 0)) AS total_global_qty, + SUM(COALESCE(so.backorder_qty, 0)) AS total_backorder, + MAX(so.time) AS last_scraped + FROM latest so + JOIN vendors v ON v.id = so.source_vendor_id + GROUP BY v.id, v.name, v.website + ORDER BY product_count DESC + `), + + // Recently restocked (stock appeared in last 24h) + pool.query(` + SELECT + t.part_number, + t.form_factor, + t.speed, + v.name AS vendor_name, + so.warehouse_de_qty, + so.warehouse_global_qty, + so.time AS observed_at + FROM stock_observations so + JOIN transceivers t ON t.id = so.transceiver_id + JOIN vendors v ON v.id = so.source_vendor_id + WHERE so.time >= NOW() - INTERVAL '24 hours' + AND so.in_stock = true + AND (so.warehouse_de_qty > 0 OR so.warehouse_global_qty > 0) + ORDER BY so.time DESC + LIMIT 10 + `), + ]); + + res.json({ + success: true, + data: { + totals: totals.rows[0], + top_sellers: topSellers.rows, + vendor_breakdown: vendorBreakdown.rows, + recently_updated: recentlyUpdated.rows, + }, + }); + } catch (err) { + console.error("GET /api/stock/summary error:", err); + res.status(500).json({ success: false, error: "Internal server error" }); + } +}); + +// ─── GET /api/stock/:id ────────────────────────────────────────────────────── +/** + * Full observation history for one transceiver. + * :id can be a UUID or a part_number (case-insensitive). 
+ * Query params: + * vendor_id β€” filter by vendor UUID + * days β€” look-back window in days (default 30) + * limit β€” max observations returned (default 100) + */ +stockRouter.get("/:id", async (req: Request, res: Response) => { + try { + const id = String(req.params.id); + const days = intParam(req, "days", 30); + const limit = Math.min(intParam(req, "limit", 100), 500); + const vendorId = req.query.vendor_id ? String(req.query.vendor_id) : null; + + // Resolve UUID vs part_number + let transceiverUuid: string | null = null; + const uuidRegex = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i; + if (uuidRegex.test(id)) { + transceiverUuid = id; + } else { + const r = await pool.query( + `SELECT id FROM transceivers WHERE part_number ILIKE $1 LIMIT 1`, + [id] + ); + if (r.rows.length > 0) transceiverUuid = r.rows[0].id; + } + + if (!transceiverUuid) { + res.status(404).json({ success: false, error: "Transceiver not found" }); + return; + } + + const params: unknown[] = [transceiverUuid, days, limit]; + let vendorFilter = ""; + if (vendorId) { + params.push(vendorId); + vendorFilter = `AND so.source_vendor_id = $${params.length}`; + } + + const [transceiver, observations] = await Promise.all([ + pool.query( + `SELECT t.*, v.name AS brand_name + FROM transceivers t LEFT JOIN vendors v ON v.id = t.brand_vendor_id + WHERE t.id = $1`, + [transceiverUuid] + ), + pool.query( + `SELECT + so.time, + v.name AS vendor_name, + v.website AS vendor_website, + so.in_stock, + so.quantity_available, + so.warehouse_de_qty, + so.warehouse_de_delivery_date, + so.warehouse_global_qty, + so.warehouse_global_delivery_date, + so.backorder_qty, + so.backorder_estimated_date, + so.units_sold, + so.compatible_brands, + so.price_net, + so.product_url + FROM stock_observations so + JOIN vendors v ON v.id = so.source_vendor_id + WHERE so.transceiver_id = $1 + AND so.time >= NOW() - ($2 || ' days')::INTERVAL + ${vendorFilter} + ORDER BY so.time DESC + LIMIT $3`, + params + 
), + ]); + + if (!transceiver.rows[0]) { + res.status(404).json({ success: false, error: "Transceiver not found" }); + return; + } + + res.json({ + success: true, + data: { + transceiver: transceiver.rows[0], + observations: observations.rows, + meta: { + count: observations.rows.length, + days_requested: days, + }, + }, + }); + } catch (err) { + console.error("GET /api/stock/:id error:", err); + res.status(500).json({ success: false, error: "Internal server error" }); + } +}); diff --git a/packages/dashboard/index.html b/packages/dashboard/index.html index 4a46004..e145c73 100644 --- a/packages/dashboard/index.html +++ b/packages/dashboard/index.html @@ -800,6 +800,8 @@
Procurement Intelligence
🕷 Crawler Intelligence
🌐 Network
+
✎ Review
+
🏭 Stock
@@ -977,10 +979,11 @@ +
- +
@@ -1313,6 +1316,38 @@
+ +
+
✍️ Artikel manuell generieren
+
+
+ + +
Leer lassen = Thema aus Template. LLM generiert immer eine bessere Headline am Ende.
+
+
+ + +
+
+
+ + +
+ +
+
@@ -1361,6 +1396,27 @@
+ +
+
+
+ 📅 Beste Posting-Zeit + loading… +
+
+ + +
+
+
Lade Posting-Zeit-Analyse…
+ + +
+
@@ -1602,6 +1658,153 @@ + + + + + + + + @@ -1672,8 +1875,10 @@ function esc(str) { function el(id) { return document.getElementById(id); } -function api(path) { - return fetch(API + path, { headers: getAuthHeaders() }).then(function(r) { +function api(path, opts) { + var fetchOpts = Object.assign({}, opts || {}); + fetchOpts.headers = Object.assign({}, getAuthHeaders(), (opts && opts.headers) || {}); + return fetch(API + path, fetchOpts).then(function(r) { if (r.status === 401) { handleAuthError(401); throw new Error('Unauthorized'); } var ct = r.headers.get('content-type') || ''; if (ct.indexOf('application/json') === -1) { @@ -1972,17 +2177,27 @@ function goToTab(tabName) { if (tabName === 'news') loadNews(1); if (tabName === 'vendors') loadVendors(); if (tabName === 'standards') loadStandardsList(); - if (tabName === 'blog') { loadBlogDrafts(); loadSLLInsights(); loadBlogLLMStatus(); } + if (tabName === 'blog') { loadBlogDrafts(); loadSLLInsights(); loadBlogLLMStatus(); loadPostingTime(); } if (tabName === 'finder') document.getElementById('finder-switch-input').focus(); if (tabName === 'crawlers') loadCrawlerStatus(); if (tabName === 'procurement') loadProcurement(); if (tabName === 'network') loadProxyNetwork(); + if (tabName === 'review') loadReview(); + if (tabName === 'stock') loadStock(); } document.querySelectorAll('.tab').forEach(function(tab) { tab.addEventListener('click', function() { goToTab(tab.dataset.tab); }); }); +// Navigate to Transceivers tab with a verified filter pre-applied +// type: 'price' | 'image' | 'details' | 'full' +function goToVerifiedFilter(type) { + var filterEl = el('tx-verified-filter'); + if (filterEl) filterEl.value = type; + goToTab('transceivers'); +} + // Clickable header stats and overview cards document.querySelectorAll('[data-goto]').forEach(function(elem) { elem.addEventListener('click', function() { goToTab(this.getAttribute('data-goto')); }); @@ -2008,13 +2223,17 @@ async function loadOverview() { var v = 
h.verification; var total = v.total || 1; var items = [ - { label: 'Price Verified', count: v.price_verified, pct: v.price_coverage_pct || Math.round(v.price_verified / total * 100), color: '#22c55e' }, - { label: 'Image Verified', count: v.image_verified, pct: Math.round(v.image_verified / total * 100), color: '#3b82f6' }, - { label: 'Details Verified', count: v.details_verified, pct: Math.round(v.details_verified / total * 100), color: '#a855f7' }, - { label: 'Fully Verified', count: v.fully_verified, pct: v.fully_verified_pct || Math.round(v.fully_verified / total * 100), color: '#f97316' }, + { label: 'Price Verified', count: v.price_verified, pct: v.price_coverage_pct || Math.round(v.price_verified / total * 100), color: '#22c55e', filter: 'price' }, + { label: 'Image Verified', count: v.image_verified, pct: Math.round(v.image_verified / total * 100), color: '#3b82f6', filter: 'image' }, + { label: 'Details Verified', count: v.details_verified, pct: Math.round(v.details_verified / total * 100), color: '#a855f7', filter: 'details' }, + { label: 'Fully Verified', count: v.fully_verified, pct: v.fully_verified_pct || Math.round(v.fully_verified / total * 100), color: '#f97316', filter: 'full' }, ]; buildDOM(el('verification-overview'), items.map(function(item) { - return '
' + return '
' + '
' + '' + item.label + '' + '' + (item.count || 0).toLocaleString() + '' @@ -2022,7 +2241,7 @@ async function loadOverview() { + '
' + '
' + '
' - + '
' + (item.pct || 0) + '% of ' + total.toLocaleString() + '
' + + '
' + (item.pct || 0) + '% of ' + total.toLocaleString() + ' Β· click to filter β†’
' + '
'; }).join('')); } @@ -2740,22 +2959,25 @@ function searchTransceivers() { var q = el('tx-search').value; var ff = el('tx-ff-filter').value; var vf = el('tx-vendor-filter').value; + var verifiedF = (el('tx-verified-filter') || {}).value || ''; var params = []; if (q) params.push('q=' + encodeURIComponent(q)); if (ff) params.push('form_factor=' + encodeURIComponent(ff)); if (vf) params.push('vendor=' + encodeURIComponent(vf)); + if (verifiedF) params.push('verified=' + encodeURIComponent(verifiedF)); params.push('limit=200'); api('/api/transceivers?' + params.join('&')).then(function(data) { lastTxData = data.data || data.transceivers || []; // Show result count in search bar placeholder var total = data.total || lastTxData.length; - var activeFilter = q || ff || vf; + var activeFilter = q || ff || vf || verifiedF; var txSearchEl = el('tx-search'); if (txSearchEl && !activeFilter) txSearchEl.placeholder = 'Filter: Nexus 9300, QSFP28, 400G, coherent… (' + total + ' transceivers total)'; // Show count above table + clear button var countNote = el('tx-result-count'); - if (countNote) countNote.textContent = 'Showing ' + lastTxData.length + (data.total && data.total > lastTxData.length ? ' of ' + data.total : '') + ' transceivers' + (activeFilter ? ' β€” filter active' : ''); + var filterLabel = verifiedF ? (' β€” ' + (verifiedF === 'full' ? 'β˜… Fully Verified' : verifiedF.charAt(0).toUpperCase() + verifiedF.slice(1) + ' Verified') + ' filter') : (activeFilter ? ' β€” filter active' : ''); + if (countNote) countNote.textContent = 'Showing ' + lastTxData.length + (data.total && data.total > lastTxData.length ? ' of ' + data.total : '') + ' transceivers' + filterLabel; var clearBtn = el('tx-clear-filter'); if (clearBtn) clearBtn.style.display = activeFilter ? '' : 'none'; buildDOM(el('tx-table'), lastTxData.map(function(t) { @@ -2772,10 +2994,20 @@ function searchTransceivers() { + '' + (t.price_tier ? '' + esc(t.price_tier) + '' : 'β€”') + '' + '' + (t.market_status ? 
'' + esc(t.market_status) + '' : 'β€”') + '' + '' + (t.category ? '' + esc(t.category) + '' : '') + '' - + '' + (t.fully_verified && (t.street_price_usd || t.price_verified_eur) && t.image_verified && t.details_verified - ? 'β˜… 100%' - : t.price_verified && (t.street_price_usd || t.price_verified_eur) ? 'βœ“ Price' : '') - + '' + + (function() { + var pv = !!(t.price_verified && (t.street_price_usd || t.price_verified_eur)); + var iv = !!t.image_verified; + var dv = !!t.details_verified; + var cv = !!t.competitor_verified; + if (pv && iv && dv && cv) return 'β˜… 100%'; + var s = ''; + s += '' + (pv ? 'βœ“' : 'β€”') + 'P'; + s += '' + (iv ? 'βœ“' : 'β€”') + 'I'; + s += '' + (dv ? 'βœ“' : 'β€”') + 'D'; + s += '' + (cv ? 'βœ“' : 'β€”') + 'C'; + s += ''; + return s; + })() + ''; }).join('')); @@ -2876,7 +3108,9 @@ async function openTxDetail(id) { if (pVer) verItems.push('βœ“ Price'); if (iVer) verItems.push('βœ“ Image'); if (dVer) verItems.push('βœ“ Details'); + var noMarket = !cVer && t.competitor_has_product === false && t.last_competitor_scan; if (cVer) verItems.push('βœ“ Competitor'); + else if (noMarket) verItems.push('⚠ Kein Markt'); else verItems.push('⟳ Competitor'); // 100% VERIFIED requires all 4: Price + Image + Details + Competitor var fullyVerified = fVer && cVer; @@ -3040,7 +3274,24 @@ async function openTxDetail(id) { h += '
'; } - // No price_observations β†’ show nothing. Never display estimated prices. + // No competitor prices β†’ show "Kein Markt" info block with last scan date + if (!cVer && t.last_competitor_scan) { + h += '
Wettbewerber-Verfügbarkeit
'; + h += '
'; + h += 'πŸ”΄'; + h += '
'; + h += '
Kein Wettbewerber bietet dieses Produkt an
'; + h += '
'; + h += 'Keiner unserer ' + (t.speed_gbps >= 800 ? '60+' : '60+') + ' ΓΌberwachten Anbieter (FS.com, ATGBICS, Prolabs, Skylane u.a.) hat ein identisches oder vergleichbares Produkt im Sortiment.'; + h += '
'; + h += '
'; + h += 'πŸ• Letzter Competitor-Scan: ' + fmtDate(t.last_competitor_scan) + ''; + h += '  Β·  Scans laufen tΓ€glich automatisch'; + h += '
'; + h += '
'; + h += '
'; + } + // No price_observations at all β†’ show nothing. Never display estimated prices. // Notes (scraped extra specs) if (t.notes) { @@ -4185,7 +4436,6 @@ function copyLinkedInPost(id) { // BLOG function generateBlog(topic, speed) { - el('blog-list').innerHTML = '
Generating article...
'; var body = { topic: topic }; if (speed) body.speed = speed; fetch(API + '/api/blog/generate', { @@ -4196,12 +4446,70 @@ function generateBlog(topic, speed) { if (data.success) { showToast('βš™οΈ Generating…', data.draft.title + ' β€” pipeline running (~10 min)'); loadBlogDrafts(); - // Always poll progress β€” pipeline runs async pollBlogLlm(data.draft.id, 0); } else showToast('Failed', data.error || 'Unknown error', true); }).catch(function(err) { showToast('Network error', err.message, true); }); } +function toggleBlogReviewed(id, starEl) { + fetch(API + '/api/blog/' + id + '/review', { method: 'PUT', headers: { 'Content-Type': 'application/json' } }) + .then(function(r) { return r.json(); }) + .then(function(data) { + if (!data.success) return; + var isReviewed = data.review_tag === 'reviewed'; + // Update star opacity + starEl.style.opacity = isReviewed ? '1' : '0.3'; + starEl.title = isReviewed ? 'Reviewed β€” klicken zum ZurΓΌcksetzen' : 'Noch nicht reviewed β€” klicken zum Markieren'; + // Update row border + reviewed badge + var row = starEl.closest('.ri'); + if (row) { + row.style.borderLeft = isReviewed ? '3px solid #1a7a3a' : ''; + row.dataset.reviewed = isReviewed ? '1' : '0'; + // Toggle reviewed badge in meta + var existingBadge = row.querySelector('.blog-reviewed-badge'); + if (isReviewed && !existingBadge) { + var meta = row.querySelector('.ri-meta'); + if (meta) { + var badge = document.createElement('span'); + badge.className = 'b b-green blog-reviewed-badge'; + badge.style.cssText = 'background:#1a7a3a22;color:#1a7a3a;border-color:#1a7a3a44'; + badge.textContent = 'βœ“ reviewed'; + meta.appendChild(badge); + } + } else if (!isReviewed && existingBadge) { + existingBadge.remove(); + } + } + showToast(isReviewed ? 
'βœ… Reviewed' : '↩ Review zurΓΌckgesetzt', ''); + }) + .catch(function() { showToast('Fehler', 'Review-Status konnte nicht gesetzt werden', true); }); +} + +function generateBlogManual() { + var customTitle = (document.getElementById('blog-custom-title').value || '').trim(); + var topic = document.getElementById('blog-manual-topic').value || 'technology_deep_dive'; + var additionalContext = (document.getElementById('blog-additional-context').value || '').trim(); + + var body = { topic: topic }; + if (customTitle) body.custom_title = customTitle; + if (additionalContext) body.additional_context = additionalContext; + + showToast('βš™οΈ Gestartet', (customTitle || 'Artikel') + ' β€” Pipeline lΓ€uft (~10 min)'); + fetch(API + '/api/blog/generate', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(body) + }).then(function(r) { return r.json(); }).then(function(data) { + if (data.success) { + showToast('βœ“ Pipeline gestartet', data.draft.title + ' β€” wird in ~10 min fertig'); + document.getElementById('blog-custom-title').value = ''; + document.getElementById('blog-additional-context').value = ''; + loadBlogDrafts(); + pollBlogLlm(data.draft.id, 0); + } else showToast('Fehler', data.error || 'Unbekannter Fehler', true); + }).catch(function(err) { showToast('Netzwerkfehler', err.message, true); }); +} + function pollBlogLlm(id, attempt) { if (attempt > 60) return; // max 10 min (60 Γ— 10s) setTimeout(function() { @@ -4219,6 +4527,8 @@ function pollBlogLlm(id, attempt) { } }).catch(function() {}); loadBlogDrafts(); + // Show posting time recommendation after generation + showPostingTimeForBlog(); } else { // Still running β€” update badge with step info var badge = document.querySelector('.ri[data-blog-id="' + id + '"] .blog-status-badge'); @@ -4495,11 +4805,14 @@ async function loadBlogDrafts() { } var gen = (d.generated_by || '').replace('tip-blog-engine-', ''); var gc = gen.includes('fo-blog') ? 
'b-green' : gen === 'template-fallback' ? 'b-yellow' : 'b-neutral'; - return '
' + // Review tracking: 'reviewed' tag stored in d.review_tag (set via toggleBlogReviewed) + var isReviewed = d.review_tag === 'reviewed'; + return '
' + '
' + '
' + esc(d.title) + '
' + '
' + '' + statusLabel + '' + + 'βœ…' + 'βœ•' + '
' + '
' @@ -4509,6 +4822,7 @@ async function loadBlogDrafts() { + '' + esc(gen || 'template') + '' + '' + esc(d.word_count) + ' words' + (d.linkedin_post ? '👤 LI' : '') + + (isReviewed ? 'βœ“ reviewed' : '') + '' + new Date(d.created_at).toLocaleString('de-DE', {day:'2-digit',month:'2-digit',year:'numeric',hour:'2-digit',minute:'2-digit'}) + '' + '
'; }).join('') || '
No drafts yet — click a card above to generate
'); @@ -4840,6 +5154,131 @@ async function sllAnalyze() { } } +// ══════════════════════════════════════════════════════ +// POSTING TIME β€” Umami + SLL combined recommendation +// ══════════════════════════════════════════════════════ + +async function loadPostingTime() { + var token = window.loadToken ? window.loadToken() : ''; + var el = document.getElementById('posting-time-content'); + var badge = document.getElementById('posting-time-badge'); + if (!el) return; + try { + var r = await fetch(API + '/api/blog/sll/posting-time', { headers: { 'Authorization': 'Bearer ' + token } }); + var d = await r.json(); + if (!d.success) { el.innerHTML = 'Fehler beim Laden'; return; } + + var rec = d.recommended; + var top = d.top_slots || []; + var ds = d.data_sources || {}; + + // Badge + badge.textContent = rec ? rec.label : 'keine Daten'; + badge.style.background = 'rgba(99,102,241,0.2)'; + badge.style.color = '#a5b4fc'; + + var h = ''; + + // Recommended slot big display + if (rec) { + h += '
'; + h += '
' + rec.label + '
'; + h += '
'; + h += '
Optimaler Zeitslot Β· Score ' + rec.score + '/100
'; + h += '
'; + if (rec.umami_sessions > 0) h += 'πŸ“Š ' + rec.umami_sessions + ' Umami-Sessions  '; + if (rec.sll_avg_engagement !== null) h += '🧠 SLL βŒ€' + rec.sll_avg_engagement + ' Score'; + h += '
'; + h += '
'; + h += '
'; + } + + // Top 5 slots bar chart + if (top.length > 0) { + var maxScore = top[0].score || 1; + h += '
Top Zeitslots
'; + h += '
'; + top.slice(0, 5).forEach(function(slot, i) { + var pct = Math.round((slot.score / maxScore) * 100); + var isTop = i === 0; + h += '
'; + h += '
' + slot.label + '
'; + h += '
'; + h += '
'; + h += '
'; + h += '
' + slot.score + '
'; + var srcIcons = (slot.data_sources || []).map(function(s) { return s === 'umami' ? 'πŸ“Š' : '🧠'; }).join(''); + h += '
' + srcIcons + '
'; + h += '
'; + }); + h += '
'; + } + + // Data sources footnote + h += '
'; + h += 'πŸ“Š Umami: ' + (ds.umami_sessions_analyzed || 0) + ' Sessions (90d)'; + if (ds.umami_cache_age_min !== null && ds.umami_cache_age_min !== undefined) h += ' Β· Cache: ' + ds.umami_cache_age_min + ' min alt'; + h += '  |  🧠 SLL: ' + (ds.sll_posts_with_time || 0) + ' Posts mit Zeit'; + h += '
'; + + if (d.note) { + h += '
' + d.note + '
'; + } + + el.innerHTML = h; + + // Store recommended globally for use in showPostingTimeForBlog + window._lastPostingTimeRec = rec; + } catch(e) { + if (el) el.innerHTML = 'Posting-Zeit-Daten nicht verfΓΌgbar'; + } +} + +function showPostingTimeForBlog() { + var rec = window._lastPostingTimeRec; + var highlight = document.getElementById('posting-time-highlight'); + var recEl = document.getElementById('posting-time-recommended'); + var reasonEl = document.getElementById('posting-time-reason'); + if (!highlight || !recEl) return; + + if (rec) { + recEl.textContent = 'πŸ“… ' + rec.label; + var parts = []; + if (rec.umami_sessions > 0) parts.push(rec.umami_sessions + ' Umami-Sessions'); + if (rec.sll_avg_engagement !== null) parts.push('SLL βŒ€' + rec.sll_avg_engagement); + reasonEl.textContent = parts.length > 0 ? 'Basis: ' + parts.join(' Β· ') + ' Β· Score ' + rec.score + '/100' : 'Score ' + rec.score + '/100'; + highlight.style.display = 'block'; + } else { + // Re-fetch in case it wasn't loaded yet + loadPostingTime().then(function() { + if (window._lastPostingTimeRec) showPostingTimeForBlog(); + }); + } +} + +async function syncUmami() { + var token = window.loadToken ? 
window.loadToken() : ''; + var btn = document.getElementById('umami-sync-btn'); + if (btn) { btn.textContent = '⏳ Syncing…'; btn.disabled = true; } + try { + var r = await fetch(API + '/api/blog/sll/sync-umami', { + method: 'POST', + headers: { 'Content-Type': 'application/json', 'Authorization': 'Bearer ' + token } + }); + var d = await r.json(); + if (d.success) { + showToast('Umami synced βœ“', d.total_sessions + ' Sessions aus ' + d.slots_loaded + ' Slots geladen'); + loadPostingTime(); + } else { + showToast('Fehler', d.message || 'Umami nicht erreichbar', true); + } + } catch(e) { + showToast('Error', e.message, true); + } finally { + if (btn) { btn.textContent = '↻ Umami'; btn.disabled = false; } + } +} + // TABLE SORTING function makeSortable(table) { if (!table) return; @@ -5172,6 +5611,11 @@ async function loadProcLifecycle() { // INIT loadOverview(); loadChangelog(); +searchTransceivers(); // pre-load transceivers table on startup +// Review stats: only after confirming auth (loadOverview sets up auth state) +setTimeout(function() { + if (window.loadToken && window.loadToken()) loadReviewStats().catch(function() {}); +}, 1500); // ── CRAWLER INTELLIGENCE ──────────────────────────────────────────── async function loadCrawlerStatus() { @@ -5546,6 +5990,442 @@ function copyInstallCmd(el) { setTimeout(function() { el.style.borderColor = orig; }, 1000); }).catch(function() {}); } + +// ══════════════════════════════════════════════════════════════════════════════ +// REVIEW TAB β€” Manual equivalence review queue +// ══════════════════════════════════════════════════════════════════════════════ + +var reviewState = { filter: 'pending', page: 1, total: 0, loaded: false }; + +async function loadReview() { + if (!reviewState.loaded) { + await loadReviewStats(); + reviewState.loaded = true; + } + await loadReviewPage(1); +} + +async function loadReviewStats() { + try { + var s = await api('/api/review/equivalences/stats'); + var stats = s.stats || {}; + // 
Update pending badge in tab nav + var badge = el('review-pending-badge'); + if (badge) { + if (stats.pending > 0) { + badge.textContent = stats.pending; + badge.style.display = ''; + } else { + badge.style.display = 'none'; + } + } + // Stat pills inside review tab + buildDOM(el('review-stat-pills'), [ + { label: 'Pending', count: stats.pending, color: '#f97316' }, + { label: 'Auto-Approved', count: stats.auto_approved, color: '#22c55e' }, + { label: 'Approved', count: stats.approved, color: '#6366f1' }, + { label: 'Rejected', count: stats.rejected, color: '#ef4444' }, + { label: 'Re-Research', count: stats.needs_research, color: '#f59e0b' }, + ].map(function(p) { + return '' + + '' + (p.count||0) + ' ' + p.label + ''; + }).join('')); + // Update Re-Research filter badge + var nrBadge = el('needs-research-badge'); + if (nrBadge) { + var nrCount = stats.needs_research || 0; + nrBadge.textContent = nrCount; + nrBadge.style.display = nrCount > 0 ? '' : 'none'; + } + } catch(e) {} +} + +async function loadReviewPage(page) { + reviewState.page = page; + var list = el('review-list'); + var empty = el('review-empty'); + var more = el('review-load-more'); + if (page === 1) buildDOM(list, ''); + + try { + var data = await api('/api/review/equivalences?status=' + reviewState.filter + '&page=' + page + '&limit=30'); + var items = data.data || []; + reviewState.total = data.total || 0; + + if (items.length === 0 && page === 1) { + if (empty) empty.style.display = ''; + if (more) more.style.display = 'none'; + return; + } + if (empty) empty.style.display = 'none'; + + var html = items.map(function(eq) { + return buildReviewCard(eq); + }).join(''); + list.insertAdjacentHTML('beforeend', html); + + var shown = (page - 1) * 30 + items.length; + if (more) more.style.display = shown < reviewState.total ? '' : 'none'; + } catch(e) { + if (list) list.innerHTML = '
Error loading review queue: ' + esc(String(e)) + '
'; + } +} + +function buildReviewCard(eq) { + var conf = parseFloat(eq.confidence || 0); + var confPct = Math.round(conf * 100); + var confColor = conf >= 0.85 ? '#22c55e' : conf >= 0.65 ? '#f97316' : '#ef4444'; + var statusColor = { pending: '#f97316', approved: '#6366f1', auto_approved: '#22c55e', rejected: '#ef4444' }; + var sc = statusColor[eq.status] || 'var(--text-dim)'; + + var basis = (eq.match_basis || []).join(' Β· '); + var fxUrl = eq.fx_url ? 'β†— product page' : ''; + var cpUrl = eq.cp_url ? 'β†— product page' : ''; + var cpPrice = eq.cp_latest_price + ? '' + parseFloat(eq.cp_latest_price).toFixed(2) + ' ' + (eq.cp_latest_currency||'') + '' + : 'no price'; + + var reResearchBadge = eq.re_research_due_at + ? '⏳ Re-Research' + : ''; + + var actionBtns = ''; + if (eq.status === 'pending') { + actionBtns = '
' + + '' + + '' + + '' + + '
'; + } else if (eq.status === 'approved' || eq.status === 'auto_approved') { + var reResearchInfo = eq.re_research_due_at + ? '⏳ Due ' + new Date(eq.re_research_due_at).toLocaleDateString() + (eq.re_researched_at ? ' · last checked ' + new Date(eq.re_researched_at).toLocaleDateString() : '') + '' + : ''; + actionBtns = '
' + + 'βœ“ ' + (eq.status === 'auto_approved' ? 'Auto-approved' : 'Approved by ' + esc(eq.reviewed_by||'β€”')) + '' + + reResearchInfo + + '' + + '
'; + } else if (eq.status === 'rejected') { + actionBtns = '
' + + 'βœ• Rejected' + (eq.reject_reason ? ': ' + esc(eq.reject_reason) : '') + '' + + '' + + '
'; + } + + return '
' + // Header: status + confidence + + '
' + + '
' + + '' + esc(eq.status.replace('_',' ')) + '' + + reResearchBadge + + '
' + + '
' + + 'Confidence' + + '' + confPct + '%' + + '
' + + '
' + // Two-column comparison + + '
' + // Flexoptix side + + '
' + + '
Flexoptix
' + + '
' + esc(eq.fx_part_number||'β€”') + '
' + + (eq.fx_standard_name ? '
' + esc(eq.fx_standard_name) + '
' : '') + + '
' + + '' + esc(eq.fx_form_factor||'') + ' ' + + '' + esc(eq.fx_speed||'') + ' ' + + (eq.fx_reach_label ? '' + esc(eq.fx_reach_label) + '' : '') + + (eq.fx_fiber_type ? ' Β· ' + esc(eq.fx_fiber_type) : '') + + '
' + + (fxUrl ? '
' + fxUrl + '
' : '') + + '
' + // Arrow + + '
↔
' + // Competitor side + + '
' + + '
' + esc(eq.cp_vendor||'Competitor') + '
' + + '
' + esc(eq.cp_part_number||'β€”') + '
' + + (eq.cp_standard_name ? '
' + esc(eq.cp_standard_name) + '
' : '') + + '
' + + '' + esc(eq.cp_form_factor||'') + ' ' + + '' + esc(eq.cp_speed||'') + ' ' + + (eq.cp_reach_label ? '' + esc(eq.cp_reach_label) + '' : '') + + (eq.cp_fiber_type ? ' Β· ' + esc(eq.cp_fiber_type) : '') + + '
' + + '
Price: ' + cpPrice + '
' + + (cpUrl ? '
' + cpUrl + '
' : '') + + '
' + + '
' + // Match basis + + '
Match basis: ' + esc(basis||'β€”') + '
' + // Notes + + (eq.match_notes ? '
' + esc(eq.match_notes) + '
' : '
') + // Action buttons + + actionBtns + + '
'; +} + +function setReviewFilter(f) { + reviewState.filter = f; + reviewState.loaded = false; + document.querySelectorAll('.review-filter-btn').forEach(function(b) { + b.classList.toggle('active', b.dataset.rfilter === f); + b.style.background = b.dataset.rfilter === f ? 'var(--indigo)' : 'var(--surface2)'; + b.style.color = b.dataset.rfilter === f ? '#fff' : 'var(--text)'; + b.style.borderColor= b.dataset.rfilter === f ? 'var(--indigo)' : 'var(--border)'; + }); + loadReviewPage(1).then(function() { + buildDOM(el('review-list'), ''); + loadReviewPage(1); + }); +} + +async function approveEquivalence(id, btn) { + btn.disabled = true; + btn.textContent = '…'; + try { + var r = await api('/api/review/equivalences/' + id + '/approve', { method: 'POST', headers: {'Content-Type':'application/json'}, body: JSON.stringify({reviewer: 'dashboard'}) }); + var card = el('eq-card-' + id); + if (card) { + card.style.borderColor = '#22c55e'; + card.style.opacity = '0.7'; + setTimeout(function() { if (card) card.remove(); }, 800); + } + await loadReviewStats(); + if (r.fully_verified_earned) { + console.log('[review] β˜… Fully Verified earned for transceiver!'); + } + } catch(e) { + btn.disabled = false; + btn.textContent = 'βœ“ Approve'; + } +} + +async function rejectEquivalence(id, btn) { + var reason = prompt('Rejection reason (optional):') ?? ''; + btn.disabled = true; + btn.textContent = '…'; + try { + await api('/api/review/equivalences/' + id + '/reject', { method: 'POST', headers: {'Content-Type':'application/json'}, body: JSON.stringify({reason: reason, reviewer: 'dashboard'}) }); + var card = el('eq-card-' + id); + if (card) { + card.style.borderColor = '#ef4444'; + card.style.opacity = '0.7'; + setTimeout(function() { if (card) card.remove(); }, 800); + } + await loadReviewStats(); + } catch(e) { + btn.disabled = false; + btn.textContent = 'βœ• Reject'; + } +} + +async function editEquivNotes(id, btn) { + var notesEl = el('eq-notes-' + id); + var current = notesEl ? 
notesEl.textContent.trim() : ''; + var newNotes = prompt('Edit notes:', current); + if (newNotes === null) return; + try { + await api('/api/review/equivalences/' + id, { method: 'PATCH', headers: {'Content-Type':'application/json'}, body: JSON.stringify({match_notes: newNotes}) }); + if (notesEl) notesEl.textContent = newNotes; + } catch(e) { + alert('Save failed: ' + e); + } +} + +async function bulkApproveHighConfidence() { + var btn = document.getElementById('bulk-approve-btn'); + if (!btn || btn.disabled) return; + btn.disabled = true; + btn.textContent = '⏳ Approving…'; + try { + var r = await api('/api/review/equivalences/bulk-approve', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ threshold: 0.73, reviewer: 'bulk-dashboard' }) + }); + btn.textContent = 'βœ“ ' + r.approved + ' approved!'; + if (r.fully_verified_earned > 0) { + showToast('Bulk Approve', r.approved + ' matches approved Β· ' + r.fully_verified_earned + ' Γ— β˜… Fully Verified earned', false); + } else { + showToast('Bulk Approve', r.approved + ' matches β‰₯74% approved', false); + } + reviewState.loaded = false; + await loadReview(); + setTimeout(function() { + btn.disabled = false; + btn.textContent = 'βœ“ Bulk-Approve β‰₯73%'; + }, 3000); + } catch(e) { + showToast('Bulk Approve fehlgeschlagen', e.message || 'Fehler', true); + btn.disabled = false; + btn.textContent = 'βœ“ Bulk-Approve β‰₯73%'; + } +} + +async function approveAll() { + var btn = document.getElementById('approve-all-btn'); + if (!btn || btn.disabled) return; + var pending = reviewState.filter === 'pending' ? reviewState.total : '?'; + if (!confirm('Approve ALL pending equivalences? 
Low-confidence matches (<73%) will be flagged for re-research.')) return; + btn.disabled = true; + btn.textContent = '⏳ Approving…'; + try { + var r = await api('/api/review/equivalences/approve-all', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ reviewer: 'approve-all-dashboard' }) + }); + btn.textContent = 'βœ“ ' + r.approved + ' approved!'; + var msg = r.approved + ' matches approved'; + if (r.scheduled_re_research > 0) msg += ' Β· ' + r.scheduled_re_research + ' scheduled for re-research'; + if (r.fully_verified_earned > 0) msg += ' Β· ' + r.fully_verified_earned + ' Γ— β˜… Fully Verified'; + showToast('Approve All', msg, false); + reviewState.loaded = false; + await loadReview(); + setTimeout(function() { + btn.disabled = false; + btn.textContent = '⚑ Approve All Pending'; + }, 4000); + } catch(e) { + showToast('Approve All fehlgeschlagen', e.message || 'Fehler', true); + btn.disabled = false; + btn.textContent = '⚑ Approve All Pending'; + } +} + +async function runEquivalenceMatcher() { + var btn = document.querySelector('[onclick="runEquivalenceMatcher()"]'); + if (btn) { btn.disabled = true; btn.textContent = 'Queued βœ“'; } + try { + await api('/api/review/run-matcher', { method: 'POST' }); + setTimeout(function() { + if (btn) { btn.disabled = false; btn.textContent = 'β–Ά Run Matcher Now'; } + reviewState.loaded = false; + loadReview(); + }, 3000); + } catch(e) { + if (btn) { btn.disabled = false; btn.textContent = 'β–Ά Run Matcher Now'; } + } +} + +// ─── STOCK TAB ──────────────────────────────────────────────────────────────── +var stockLoaded = false; + +async function loadStock() { + if (stockLoaded) return; // already loaded β€” use Refresh button to force reload + stockLoaded = false; // allow reloads via Refresh button + try { + var data = await api('/api/stock/summary'); + if (!data.success) return; + var d = data.data; + var t = d.totals; + + // Stat cards + function setEl(id, v) { var e = 
el(id); if (e) e.textContent = v; } + setEl('stock-stat-skus', Number(t.unique_transceivers || 0).toLocaleString()); + setEl('stock-stat-instock', Number(t.in_stock_count || 0).toLocaleString()); + setEl('stock-stat-de', Number(t.total_de_qty || 0).toLocaleString()); + setEl('stock-stat-global', Number(t.total_global_qty || 0).toLocaleString()); + setEl('stock-stat-backorder', Number(t.total_backorder_qty || 0).toLocaleString()); + + // Top sellers table + var tbody = el('stock-top-sellers-body'); + if (tbody) { + if (d.top_sellers && d.top_sellers.length > 0) { + tbody.innerHTML = d.top_sellers.map(function(r) { + return '' + + '' + esc(r.part_number) + '' + + '' + esc(r.form_factor || 'β€”') + '' + + '' + Number(r.units_sold || 0).toLocaleString() + '' + + '' + (r.warehouse_de_qty != null ? Number(r.warehouse_de_qty).toLocaleString() : 'β€”') + '' + + '' + (r.warehouse_global_qty != null ? Number(r.warehouse_global_qty).toLocaleString() : 'β€”') + '' + + '' + (r.price_net != null ? '€' + Number(r.price_net).toFixed(2) : 'β€”') + '' + + ''; + }).join(''); + } else { + tbody.innerHTML = 'No data yet β€” waiting for first scrape run'; + } + } + + // Vendor breakdown + var vbody = el('stock-vendor-body'); + if (vbody) { + if (d.vendor_breakdown && d.vendor_breakdown.length > 0) { + vbody.innerHTML = d.vendor_breakdown.map(function(r) { + var lastScraped = r.last_scraped ? 
new Date(r.last_scraped).toLocaleString('de-DE', {day:'2-digit',month:'2-digit',hour:'2-digit',minute:'2-digit'}) : 'β€”'; + return '' + + '' + esc(r.vendor_name) + '' + + '' + Number(r.product_count).toLocaleString() + '' + + '' + Number(r.in_stock_count).toLocaleString() + '' + + '' + Number(r.total_de_qty || 0).toLocaleString() + '' + + '' + Number(r.total_global_qty || 0).toLocaleString() + '' + + '' + Number(r.total_backorder || 0).toLocaleString() + '' + + '' + lastScraped + '' + + ''; + }).join(''); + } else { + vbody.innerHTML = 'No data yet'; + } + } + + // Recently restocked + var recentEl = el('stock-recent'); + if (recentEl) { + if (d.recently_updated && d.recently_updated.length > 0) { + recentEl.innerHTML = '
' + + d.recently_updated.map(function(r) { + var timeStr = r.observed_at ? new Date(r.observed_at).toLocaleTimeString('de-DE', {hour:'2-digit',minute:'2-digit'}) : ''; + return '' + + '' + esc(r.part_number) + '' + + ' ' + esc(r.form_factor || '') + '' + + ' DE:' + (r.warehouse_de_qty || 0) + '' + + ' GL:' + (r.warehouse_global_qty || 0) + '' + + (timeStr ? ' @' + timeStr + '' : '') + + ''; + }).join('') + + '
'; + } else { + recentEl.textContent = 'No restock events in the last 24 hours'; + } + } + + stockLoaded = true; + } catch(e) { + console.error('loadStock error', e); + } +} + +async function lookupStock() { + var input = el('stock-lookup-input'); + var resultEl = el('stock-lookup-result'); + if (!input || !resultEl) return; + var q = (input.value || '').trim(); + if (!q) return; + resultEl.textContent = 'Looking up…'; + try { + var data = await api('/api/stock/' + encodeURIComponent(q) + '?days=7&limit=5'); + if (!data.success) { resultEl.textContent = 'Not found: ' + q; return; } + var obs = data.data.observations; + var tx = data.data.transceiver; + if (!obs || obs.length === 0) { + resultEl.innerHTML = '' + esc(tx.part_number) + ' found but no stock observations in last 7 days.'; + return; + } + var latest = obs[0]; + resultEl.innerHTML = '' + esc(tx.part_number) + ' β€” ' + + esc(tx.form_factor || '') + ' ' + esc(tx.speed || '') + '
' + + 'DE-Lager: ' + (latest.warehouse_de_qty != null ? latest.warehouse_de_qty : 'β€”') + ' Β· ' + + 'Global: ' + (latest.warehouse_global_qty != null ? latest.warehouse_global_qty : 'β€”') + ' Β· ' + + 'Nachlieferung: ' + (latest.backorder_qty != null ? latest.backorder_qty : 'β€”') + ' Β· ' + + (latest.price_net != null ? '€' + Number(latest.price_net).toFixed(2) + ' (net)' : '') + + (latest.units_sold != null ? ' Β· ' + latest.units_sold + 'Γ— verkauft' : '') + + '
via ' + esc(latest.vendor_name) + ' Β· ' + new Date(latest.time).toLocaleString('de-DE') + '' + + (obs.length > 1 ? ' (' + obs.length + ' observations this week)' : ''); + } catch(e) { + resultEl.textContent = 'Error: ' + e.message; + } +} diff --git a/packages/scraper/src/scrapers/fs-com.ts b/packages/scraper/src/scrapers/fs-com.ts index e31fd6c..13af993 100644 --- a/packages/scraper/src/scrapers/fs-com.ts +++ b/packages/scraper/src/scrapers/fs-com.ts @@ -1,21 +1,74 @@ /** - * FS.com Scraper β€” Prices, Stock, Product Catalog + * FS.com Scraper v2 β€” Warehouse Data, Prices, Product Catalog * - * FS.com renders products client-side (JS), so we use PlaywrightCrawler. - * Categories: /c/optical-transceivers-9 + * Phase 1: Category listing pages β†’ collect product URLs (paginated) + * Phase 2: Product detail pages β†’ extract warehouse breakdown, net price, specs + * Phase 3: Write to PostgreSQL (price_observations + stock_observations) * - * Respects: robots.txt, rate limiting (2s between requests) + * Uses German locale (www.fs.com/de/) for EUR prices and German warehouse labels: + * DE-Lager β†’ warehouse_de_qty + warehouse_de_delivery_date + * Global-Lager β†’ warehouse_global_qty + warehouse_global_delivery_date + * Nachlieferung β†’ backorder_qty + backorder_estimated_date + * verkauft β†’ units_sold + * + * Respects robots.txt and rate limits (≀12 req/min listing, ≀10 req/min detail). 
*/ import { PlaywrightCrawler, ProxyConfiguration } from "crawlee"; -import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db"; -import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash"; +import type { Page } from "playwright"; + +/** Apply stealth patches to evade Cloudflare TLS/bot fingerprinting */ +async function applyStealthPatches(page: Page): Promise { + await page.addInitScript(() => { + // Remove webdriver flag + Object.defineProperty(navigator, "webdriver", { get: () => undefined }); + // Mimic Chrome plugins (empty list = detected as bot) + Object.defineProperty(navigator, "plugins", { + get: () => ({ + 0: { name: "Chrome PDF Plugin", filename: "internal-pdf-viewer", description: "Portable Document Format", length: 1 }, + length: 1, + namedItem: (n: string) => null, + refresh: () => {}, + item: (i: number) => null, + [Symbol.iterator]: function* () { yield (this as any)[0]; } + }) + }); + // Real Chrome languages + Object.defineProperty(navigator, "languages", { get: () => ["de-DE", "de", "en-US", "en"] }); + // Non-zero hardware concurrency + Object.defineProperty(navigator, "hardwareConcurrency", { get: () => 8 }); + // Permissions API + const originalQuery = window.navigator.permissions?.query; + if (originalQuery) { + (window.navigator.permissions as any).query = (params: any) => + params.name === "notifications" + ? 
Promise.resolve({ state: Notification.permission } as PermissionStatus) + : originalQuery.call(navigator.permissions, params); + } + // Chrome object (headless detection) + (window as any).chrome = { runtime: {}, loadTimes: () => {}, csi: () => {}, app: {} }; + // Hide automation-specific properties + delete (window as any).__playwright; + delete (window as any).__pwInitScripts; + }); +} +import { + ensureVendor, + upsertPriceObservation, + upsertStockObservation, + findOrCreateScrapedTransceiver, + pool, +} from "../utils/db"; +import { contentHash } from "../utils/hash"; import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater"; -// Use German store (de.fs.com) β€” scrapes EUR prices directly from the website. -// EUR is the primary price; USD is derived (EUR β†’ USD), never the reverse. -const BASE_URL = "https://de.fs.com"; +// ── Constants ────────────────────────────────────────────────────────────────── + +// FS.com German locale uses www.fs.com/de/ (de.fs.com DNS does not exist) +const BASE_URL = "https://www.fs.com/de"; +const MAX_PAGES_PER_CATEGORY = 10; +const MAX_DETAIL_PAGES_PER_RUN = 300; +const STOCK_FRESH_HOURS = 12; -// SOCKS5 proxy pool β€” rotate across CT130/131/132 to avoid IP blocks const PROXY_URLS = (process.env["PROXY_URLS"] ?? 
"") .split(",") .map((u) => u.trim()) @@ -41,48 +94,135 @@ const CATEGORY_URLS = [ "/c/100g-dwdm-qsfp28-3863", ]; -interface FsProduct { - partNumber: string; - name: string; - price: number; - currency: string; - stockLevel: string; - quantity?: number; - url: string; - formFactor?: string; - speedGbps?: number; - speed?: string; - reachLabel?: string; +const DE_COOKIES = [ + { name: "currency", value: "EUR", domain: ".fs.com", path: "/" }, + { name: "lang", value: "de", domain: ".fs.com", path: "/" }, + { name: "country", value: "DE", domain: ".fs.com", path: "/" }, +]; + +// ── German locale parsers ────────────────────────────────────────────────────── + +const GERMAN_MONTHS: Record = { + jan: "01", feb: "02", mΓ€r: "03", mar: "03", + apr: "04", mai: "05", may: "05", jun: "06", + jul: "07", aug: "08", sep: "09", okt: "10", + oct: "10", nov: "11", dez: "12", dec: "12", +}; + +/** + * Parse German-formatted quantity string. + * "4.895" β†’ 4895 (period = thousands separator in German) + * "210.9K" β†’ 210900 + * "1.2M" β†’ 1200000 + */ +function parseGermanQty(text: string): number | undefined { + const t = text.trim().replace(/\s/g, ""); + if (!t) return undefined; + + const kMatch = t.match(/^([\d.,]+)[Kk]$/); + if (kMatch) { + const n = parseFloat(kMatch[1].replace(/\./g, "").replace(",", ".")); + return isNaN(n) ? undefined : Math.round(n * 1_000); + } + + const mMatch = t.match(/^([\d.,]+)[Mm]$/); + if (mMatch) { + const n = parseFloat(mMatch[1].replace(/\./g, "").replace(",", ".")); + return isNaN(n) ? undefined : Math.round(n * 1_000_000); + } + + const n = parseInt(t.replace(/\./g, "").replace(/,/g, ""), 10); + return isNaN(n) ? undefined : n; } +/** + * Parse German date to ISO "YYYY-MM-DD". 
+ * "20 Apr., 2026" β†’ "2026-04-20" + * "20.04.2026" β†’ "2026-04-20" + */ +function parseGermanDate(text: string): string | undefined { + const numericMatch = text.match(/(\d{1,2})\.(\d{1,2})\.(\d{4})/); + if (numericMatch) { + const [, d, m, y] = numericMatch; + return `${y}-${m.padStart(2, "0")}-${d.padStart(2, "0")}`; + } + const wordMatch = text.match(/(\d{1,2})\.?\s+([A-Za-zΓ„Γ–ΓœΓ€ΓΆΓΌΓŸ]+)\.?,?\s*(\d{4})/); + if (!wordMatch) return undefined; + const day = wordMatch[1].padStart(2, "0"); + const monthRaw = wordMatch[2] + .toLowerCase() + .replace(/Γ€/g, "a").replace(/ΓΆ/g, "o").replace(/ΓΌ/g, "u") + .slice(0, 3); + const month = GERMAN_MONTHS[monthRaw]; + if (!month) return undefined; + return `${wordMatch[3]}-${month}-${day}`; +} + +/** + * Parse German price to EUR float. + * "42,50" β†’ 42.50 + * "1.063,02" β†’ 1063.02 + */ +function parseGermanPrice(raw: string): number | undefined { + const cleaned = raw.replace(/[^0-9.,]/g, "").trim(); + if (!cleaned) return undefined; + let normalized: string; + if (/\d+\.\d{3},\d{2}/.test(cleaned)) { + normalized = cleaned.replace(/\./g, "").replace(",", "."); + } else if (cleaned.includes(",")) { + normalized = cleaned.replace(",", "."); + } else { + normalized = cleaned; + } + const n = parseFloat(normalized); + return isNaN(n) || n <= 0 ? undefined : n; +} + +// ── Stock level helper ───────────────────────────────────────────────────────── + +function deriveStockLevel( + deQty?: number, + globalQty?: number, + backorderQty?: number +): "in_stock" | "low_stock" | "out_of_stock" | "on_request" { + const total = (deQty ?? 0) + (globalQty ?? 0); + if (total > 100) return "in_stock"; + if (total > 0) return "low_stock"; + if ((backorderQty ?? 
0) > 0) return "on_request"; + return "out_of_stock"; +} + +// ── Product classification ───────────────────────────────────────────────────── + function detectFormFactor(text: string): string | undefined { - const lower = text.toLowerCase(); - if (lower.includes("osfp") && !lower.includes("qsfp")) return "OSFP"; - if (lower.includes("qsfp-dd800") || lower.includes("qsfp-dd 800")) return "QSFP-DD800"; - if (lower.includes("qsfp-dd")) return "QSFP-DD"; - if (lower.includes("qsfp56")) return "QSFP56"; - if (lower.includes("qsfp28")) return "QSFP28"; - if (lower.includes("qsfp+") || lower.includes("qsfp plus")) return "QSFP+"; - if (lower.includes("sfp56")) return "SFP56"; - if (lower.includes("sfp28")) return "SFP28"; - if (lower.includes("sfp+") || lower.includes("sfp plus")) return "SFP+"; - if (lower.includes("sfp") && !lower.includes("qsfp")) return "SFP"; - if (lower.includes("cfp2")) return "CFP2"; - if (lower.includes("xfp")) return "XFP"; + const l = text.toLowerCase(); + if (l.includes("osfp") && !l.includes("qsfp")) return "OSFP"; + if (l.includes("qsfp-dd800") || l.includes("qsfp-dd 800")) return "QSFP-DD800"; + if (l.includes("qsfp-dd")) return "QSFP-DD"; + if (l.includes("qsfp56")) return "QSFP56"; + if (l.includes("qsfp28")) return "QSFP28"; + if (l.includes("qsfp+") || l.includes("qsfp plus")) return "QSFP+"; + if (l.includes("sfp56")) return "SFP56"; + if (l.includes("sfp28")) return "SFP28"; + if (l.includes("sfp+") || l.includes("sfp plus")) return "SFP+"; + if (l.includes("sfp") && !l.includes("qsfp")) return "SFP"; + if (l.includes("cfp2")) return "CFP2"; + if (l.includes("xfp")) return "XFP"; return undefined; } function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined { const patterns: [RegExp, string, number][] = [ + [/1\.6\s*t/i, "1.6T", 1600], [/800\s*g/i, "800G", 800], [/400\s*g/i, "400G", 400], [/200\s*g/i, "200G", 200], [/100\s*g/i, "100G", 100], - [/50\s*g/i, "50G", 50], - [/40\s*g/i, "40G", 40], - [/25\s*g/i, 
"25G", 25], - [/10\s*g/i, "10G", 10], - [/1\s*g\b/i, "1G", 1], + [/50\s*g/i, "50G", 50], + [/40\s*g/i, "40G", 40], + [/25\s*g/i, "25G", 25], + [/10\s*g/i, "10G", 10], + [/\b1\s*g\b/i, "1G", 1], ]; for (const [re, speed, gbps] of patterns) { if (re.test(text)) return { speed, speedGbps: gbps }; @@ -91,301 +231,524 @@ function detectSpeed(text: string): { speed: string; speedGbps: number } | undef } function detectReach(text: string): string | undefined { - const match = text.match(/(\d+)\s*(m|km)\b/i); - if (match) return `${match[1]}${match[2].toLowerCase()}`; - return undefined; + const m = text.match(/(\d+)\s*(m|km)\b/i); + return m ? `${m[1]}${m[2].toLowerCase()}` : undefined; } -export async function scrapeFs(): Promise { - console.log("=== FS.com Scraper Starting ===\n"); +// ── Types ────────────────────────────────────────────────────────────────────── - const vendorId = await ensureVendor( - "FS.COM", - "compatible", - "https://www.fs.com", - "https://www.fs.com/c/optical-transceivers-9" - ); - console.log(`Vendor ID: ${vendorId}`); +interface ProductSummary { + url: string; + name: string; + partNumber: string; +} - const products: FsProduct[] = []; - let pagesScraped = 0; +interface ProductDetail extends ProductSummary { + priceNet?: number; + deQty?: number; + deDeliveryDate?: string; + globalQty?: number; + globalDeliveryDate?: string; + backorderQty?: number; + backorderDate?: string; + unitsSold?: number; + compatibleBrands: string[]; + specs: Record; + imageUrl?: string; + datasheetUrl?: string; +} + +// ── Phase 1: Collect product URLs ────────────────────────────────────────────── + +/** + * Visit all category pages (paginated) and return a Map of product URL β†’ summary. + * Pages are queued in round-robin order (all p1s, then all p2s, …) so an + * exhausted category is detected before we waste further requests on it. 
+ */ +async function collectProductUrls( + proxyConfiguration: ProxyConfiguration | undefined +): Promise> { + const products = new Map(); + const exhausted = new Set(); + + // Pre-queue: all page-1s, then all page-2s, … + const listingRequests = Array.from({ length: MAX_PAGES_PER_CATEGORY }, (_, i) => + CATEGORY_URLS.map((cat) => ({ + url: i === 0 ? `${BASE_URL}${cat}` : `${BASE_URL}${cat}?page=${i + 1}`, + userData: { catPath: cat, pageNum: i + 1 }, + uniqueKey: `listing-${cat}-p${i + 1}`, + })) + ).flat(); - const proxyConfiguration = buildProxyConfiguration(); const crawler = new PlaywrightCrawler({ maxConcurrency: 1, - maxRequestsPerMinute: 15, + maxRequestsPerMinute: 12, requestHandlerTimeoutSecs: 60, headless: true, + useSessionPool: false, ...(proxyConfiguration ? { proxyConfiguration } : {}), launchContext: { launchOptions: { args: ["--disable-blink-features=AutomationControlled", "--lang=de-DE"], }, }, - preNavigationHooks: [ async ({ page }) => { - await page.setExtraHTTPHeaders({ - "Accept-Language": "de-DE,de;q=0.9", - }); - // EUR as primary currency β€” prices scraped verbatim from de.fs.com - await page.context().addCookies([ - { name: "currency", value: "EUR", domain: ".fs.com", path: "/" }, - { name: "lang", value: "de", domain: ".fs.com", path: "/" }, - { name: "country", value: "DE", domain: ".fs.com", path: "/" }, - ]); + await applyStealthPatches(page); + await page.setExtraHTTPHeaders({ "Accept-Language": "de-DE,de;q=0.9" }); + await page.context().addCookies(DE_COOKIES.map(c => ({ ...c, domain: "www.fs.com" }))); }, ], - async requestHandler({ page, request, log }) { - const url = request.url; - log.info(`Scraping: ${url}`); - - // Wait for Vue.js product grid to render - await page.waitForTimeout(4000); - - const productData = await page.evaluate(() => { - const results: Array<{ - name: string; - href: string; - price: string; - stock: string; - partNumber: string; - }> = []; - - // Strategy 1: Parse .category__grid__item cards (2026 Vue.js 
DOM) - const gridItems = document.querySelectorAll(".category__grid__item"); - for (const item of gridItems) { - const link = item.querySelector('a[href*="/products/"]') as HTMLAnchorElement | null; - const img = item.querySelector("img"); - const priceEl = item.querySelector(".grid__price"); - const allText = item.textContent || ""; - - if (!link) continue; - - const name = img?.getAttribute("alt")?.trim() || link.textContent?.trim() || ""; - const href = link.getAttribute("href") || ""; - const price = priceEl?.textContent?.trim() || ""; - - // Extract stock from text like "1914 in Global Warehouse" - const stockMatch = allText.match(/(\d+)\s+in\s+(?:Global\s+)?Warehouse/i); - const stock = stockMatch ? stockMatch[1] + " in stock" : ""; - - // Extract FS product ID from URL - const pnMatch = href.match(/products\/(\d+)\.html/); - const partNumber = pnMatch ? `FS-${pnMatch[1]}` : ""; - - if (name && href) { - results.push({ name, href, price, stock, partNumber }); - } - } - - // Strategy 2: Fallback β€” look for product links with prices nearby - if (results.length === 0) { - const productLinks = document.querySelectorAll( - 'a[href*="/products/"], a[href*="/product/"]' - ); - for (const link of productLinks) { - const el = link as HTMLAnchorElement; - const name = el.textContent?.trim() || ""; - const href = el.getAttribute("href") || ""; - if (!name || name.length < 5 || !href) continue; - - const container = el.closest('[class*="product"]') || el.closest('[class*="item"]') || el.closest("li") || el.parentElement?.parentElement; - let price = ""; - let stock = ""; - if (container) { - const priceEl = container.querySelector('[class*="price"]'); - price = priceEl?.textContent?.trim() || ""; - const stockEl = container.querySelector('[class*="stock"], [class*="avail"]'); - stock = stockEl?.textContent?.trim() || ""; - } - const pn = href.split("/").pop()?.replace(".html", "")?.replace(/\?.*/, "") || ""; - if (name) results.push({ name, href, price, stock, 
partNumber: pn }); - } - } - - return results; - }); - - for (const item of productData) { - if (!item.name || !item.price) continue; - - const { price, currency } = parsePrice(item.price); - const speedInfo = detectSpeed(item.name); - - if (price > 0) { - products.push({ - partNumber: item.partNumber || item.name.slice(0, 50), - name: item.name, - price, - currency, - stockLevel: item.stock ? parseStockLevel(item.stock) : "on_request", - quantity: item.stock ? parseQuantity(item.stock) : undefined, - url: item.href.startsWith("http") ? item.href : `${BASE_URL}${item.href}`, - formFactor: detectFormFactor(item.name), - speedGbps: speedInfo?.speedGbps, - speed: speedInfo?.speed, - reachLabel: detectReach(item.name), - }); - } + const { catPath, pageNum } = request.userData as { catPath: string; pageNum: number }; + if (exhausted.has(catPath)) { + log.debug(`[p1] Skip ${catPath} p${pageNum} β€” exhausted`); + return; } - pagesScraped++; - log.info(` Found ${productData.length} items on page`); + try { + await page.waitForSelector('a[href*="/products/"]', { timeout: 12000 }); + } catch { + await page.waitForTimeout(6000); + } + + const found = await page.evaluate( + (): Array<{ url: string; name: string; partNumber: string }> => { + const results: Array<{ url: string; name: string; partNumber: string }> = []; + const seen = new Set(); + for (const link of document.querySelectorAll('a[href*="/products/"]')) { + const href = link.getAttribute("href") ?? ""; + if (!href.includes("/products/")) continue; + const absUrl = href.startsWith("http") ? href : `https://www.fs.com${href}`; + if (seen.has(absUrl)) continue; + seen.add(absUrl); + const img = link.querySelector("img"); + const name = ( + img?.getAttribute("alt") ?? + link.getAttribute("title") ?? + link.textContent ?? + "" + ).trim().replace(/\s+/g, " ").slice(0, 200); + const pnMatch = href.match(/\/products\/(\d+)\.html/); + const partNumber = pnMatch ? 
`FS-${pnMatch[1]}` : ""; + if (name.length >= 5 && partNumber) results.push({ url: absUrl, name, partNumber }); + } + return results; + } + ); + + log.info(`[Listing] ${catPath} p${pageNum}: ${found.length} products`); + if (found.length === 0) { + exhausted.add(catPath); + } else { + for (const p of found) { + if (!products.has(p.url)) products.set(p.url, p); + } + } }, }); - const startUrls = CATEGORY_URLS.map((path) => `${BASE_URL}${path}`); - await crawler.run(startUrls); + await crawler.run(listingRequests); + console.log(`[Phase 1] ${products.size} unique products across ${CATEGORY_URLS.length} categories`); + return products; +} - console.log(`\nPages scraped: ${pagesScraped}`); - console.log(`Products found: ${products.length}`); +// ── Phase 2: Scrape product detail pages ────────────────────────────────────── - // Deduplicate by partNumber - const uniqueProducts = new Map(); - for (const p of products) { - const key = p.partNumber || p.name; - if (!uniqueProducts.has(key)) { - uniqueProducts.set(key, p); - } - } +async function scrapeProductDetails( + requests: Array<{ url: string; userData: { name: string; partNumber: string } }>, + proxyConfiguration: ProxyConfiguration | undefined +): Promise { + const details: ProductDetail[] = []; - // Write to database - let written = 0; - let skipped = 0; - - for (const p of uniqueProducts.values()) { - try { - const transceiverId = await findOrCreateScrapedTransceiver({ - partNumber: p.partNumber, - vendorId, - formFactor: p.formFactor, - speedGbps: p.speedGbps, - speed: p.speed, - reachLabel: p.reachLabel, - category: "DataCenter", - }); - - const hash = contentHash({ price: p.price, stock: p.stockLevel, qty: p.quantity }); - const isNew = await upsertPriceObservation({ - transceiverId, - sourceVendorId: vendorId, - price: p.price, - currency: p.currency, - stockLevel: p.stockLevel, - quantityAvailable: p.quantity, - url: p.url, - contentHash: hash, - }); - - if (isNew) written++; - else skipped++; - } catch 
(err) { - console.error(` Error: ${p.partNumber}:`, (err as Error).message); - } - } - - console.log(`\nDatabase: ${written} new, ${skipped} unchanged (${uniqueProducts.size} unique)`); - - // ═══ PHASE 2: Crawl product detail pages for VERIFIED specs ═══ - console.log("\n=== Phase 2: Scraping product detail pages for verified specs ==="); - - // Get products that need spec verification (enriched_estimated or missing key fields) - const needsSpecs = await pool.query( - `SELECT t.id, t.part_number, t.slug FROM transceivers t - JOIN vendors v ON t.vendor_id = v.id - WHERE v.slug = 'fs-com' - AND (t.data_confidence = 'enriched_estimated' OR t.data_confidence = 'unknown' - OR t.connector IS NULL OR t.connector = '' OR t.connector = '-' - OR t.wavelengths IS NULL OR t.wavelengths = '' - OR t.fiber_type IS NULL OR t.fiber_type = '') - LIMIT 200` - ); - console.log(`Products needing spec verification: ${needsSpecs.rows.length}`); - - // Build a map of product URLs from our scraped data - const productUrls = new Map(); // transceiver_id β†’ product URL - for (const p of uniqueProducts.values()) { - // Find the transceiver in DB by part number - const match = await pool.query( - `SELECT id FROM transceivers WHERE part_number = $1 AND vendor_id = $2`, - [p.partNumber, vendorId] - ).catch(() => ({ rows: [] })); - if (match.rows[0] && p.url) { - productUrls.set(match.rows[0].id, p.url); - } - } - - let specsUpdated = 0; - const specCrawler = new PlaywrightCrawler({ + const crawler = new PlaywrightCrawler({ maxConcurrency: 1, maxRequestsPerMinute: 10, - requestHandlerTimeoutSecs: 45, + requestHandlerTimeoutSecs: 90, headless: true, + useSessionPool: false, ...(proxyConfiguration ? 
{ proxyConfiguration } : {}), launchContext: { launchOptions: { - args: ["--disable-blink-features=AutomationControlled", "--lang=en-US"], + args: ["--disable-blink-features=AutomationControlled", "--lang=de-DE"], }, }, preNavigationHooks: [ async ({ page }) => { - await page.context().addCookies([ - { name: "currency", value: "USD", domain: ".fs.com", path: "/" }, - { name: "lang", value: "en", domain: ".fs.com", path: "/" }, - ]); + await applyStealthPatches(page); + await page.setExtraHTTPHeaders({ "Accept-Language": "de-DE,de;q=0.9" }); + await page.context().addCookies(DE_COOKIES.map(c => ({ ...c, domain: "www.fs.com" }))); }, ], async requestHandler({ page, request, log }) { - const transceiverId = request.userData?.transceiverId; - if (!transceiverId) return; + const { name: listingName, partNumber: listingPn } = request.userData as { + name: string; + partNumber: string; + }; + const url = request.url; - log.info(`Spec scrape: ${request.url}`); - await page.waitForTimeout(3000); - - // Extract spec table from product detail page - const specData = await page.evaluate(() => { - const specs: Record = {}; - // fs.com uses various spec table formats - const rows = document.querySelectorAll( - ".product-param tr, .product-specs tr, table.param-table tr, " + - ".specifications tr, .detail-param tr, .prod-spec-list tr, " + - '[class*="specification"] tr, [class*="param"] tr' + try { + await page.waitForSelector( + 'h1, .product-detail, [class*="product-info"], [class*="product-main"]', + { timeout: 12000 } ); - for (const row of rows) { - const cells = row.querySelectorAll("td, th"); - if (cells.length >= 2) { - const key = (cells[0]?.textContent || "").trim(); - const val = (cells[1]?.textContent || "").trim(); - if (key && val && key.length < 100) specs[key] = val; + } catch { + await page.waitForTimeout(7000); + } + + const raw = await page.evaluate( + (): { + bodyText: string; + specs: Record; + brands: string[]; + imageUrl: string; + datasheetUrl: string; + 
h1: string; + } => { + const bodyText = (document.body?.innerText ?? "").replace(/\n{3,}/g, "\n\n"); + + const specs: Record = {}; + const SEL = [ + ".product-param tr", ".product-specs tr", ".param-table tr", + ".specifications tr", ".detail-param tr", ".prod-spec-list tr", + '[class*="specification"] tr', '[class*="param"] tr', + ".tab-content tr", ".product-info-table tr", ".tech-param tr", + ].join(", "); + document.querySelectorAll(SEL).forEach((row) => { + const cells = row.querySelectorAll("td, th"); + if (cells.length >= 2) { + const k = (cells[0]?.textContent ?? "").trim().replace(/\s+/g, " "); + const v = (cells[1]?.textContent ?? "").trim().replace(/\s+/g, " "); + if (k && v && k.length < 80 && !/^[-\s]+$/.test(k)) specs[k] = v; + } + }); + document.querySelectorAll("dt").forEach((dt) => { + const dd = dt.nextElementSibling; + if (dd?.tagName === "DD") { + const k = (dt.textContent ?? "").trim(); + const v = (dd.textContent ?? "").trim(); + if (k && v && k.length < 80) specs[k] = v; + } + }); + + const brands: string[] = []; + const brandContainer = document.querySelector( + '[class*="compatible"], [class*="brand-list"], [class*="compatibility"], ' + + '[class*="apply-brand"], [id*="brand"], [id*="compatible"]' + ); + if (brandContainer) { + brandContainer.querySelectorAll("button, a, span, li").forEach((el) => { + const t = (el.textContent ?? "").trim(); + if (t && t.length > 1 && t.length < 50 && !/^\d+$/.test(t)) brands.push(t); + }); } - } - - // Also try dl/dt/dd pattern - const dts = document.querySelectorAll("dt, .spec-label, .param-label"); - for (const dt of dts) { - const dd = dt.nextElementSibling; - if (dd && (dd.tagName === "DD" || dd.classList.contains("spec-value") || dd.classList.contains("param-value"))) { - const key = (dt.textContent || "").trim(); - const val = (dd.textContent || "").trim(); - if (key && val) specs[key] = val; + if (brands.length === 0) { + const bodyTxt = document.body?.innerText ?? 
""; + const m = bodyTxt.match(/[Kk]ompatibel\s+mit[^:]*:\s*([\s\S]{0,600})/); + if (m) { + m[1].split(/[,;\n]/).forEach((s) => { + const b = s.trim(); + if (b.length > 1 && b.length < 50 && !/^\d/.test(b)) brands.push(b); + }); + } } + + const imgEl = document.querySelector( + ".product-image img, .prod-img img, .product-gallery img, " + + '[class*="main-image"] img, [class*="primary-image"] img, ' + + ".slick-current img, .product__image img" + ); + const imageUrl = imgEl?.src ?? imgEl?.getAttribute("data-src") ?? ""; + + const dsEl = document.querySelector( + 'a[href*="datasheet"], a[href*=".pdf"], a[download][href*=".pdf"]' + ); + const datasheetUrl = dsEl?.href ?? dsEl?.getAttribute("href") ?? ""; + + const h1 = document.querySelector("h1")?.textContent?.trim() ?? ""; + return { bodyText, specs, brands, imageUrl, datasheetUrl, h1 }; } + ); - // Extract image - const img = document.querySelector('.product-image img, .prod-img img, [class*="gallery"] img, .product-detail img'); - const imageUrl = img?.getAttribute("src") || ""; + if (!raw.bodyText) { log.warning(`No text: ${url}`); return; } + const t = raw.bodyText; - // Extract datasheet link - const dsLink = document.querySelector('a[href*="datasheet"], a[href*=".pdf"]'); - const datasheetUrl = dsLink?.getAttribute("href") || ""; + // ── Net price (ohne MwSt, EUR) ───────────────────────────────────────── + let priceNet: number | undefined; + for (const pat of [ + /([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*\(?ohne\s+MwSt\.?\)?/i, + /€\s*([0-9]{1,5}[,.]?[0-9]{0,3})\s*\(?ohne\s+MwSt\.?\)?/i, + /([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*netto/i, + /Netto[:\s]*€?\s*([0-9]{1,5}[,.]?[0-9]{0,3})/i, + /([0-9]{1,5},[0-9]{2})\s*€/, + /€\s*([0-9]{1,5},[0-9]{2})/, + ]) { + const m = t.match(pat); + if (m?.[1]) { + const p = parseGermanPrice(m[1]); + if (p && p > 0.5 && p < 500_000) { priceNet = p; break; } + } + } - return { specs, imageUrl, datasheetUrl }; + // ── DE-Lager ─────────────────────────────────────────────────────────── + 
let deQty: number | undefined; + let deDeliveryDate: string | undefined; + const deM = + t.match(/(\d[\d.,KkMm]*)\s*Stk\.\s*im\s*DE[- ]Lager/i) ?? + t.match(/(\d[\d.,KkMm]*)\s*im\s*DE[- ]?Lager/i); + if (deM?.[1]) { + deQty = parseGermanQty(deM[1]); + const idx = t.indexOf(deM[0]); + const ctx = t.slice(idx, idx + 300); + const dm = + ctx.match(/Lieferung[:\s]+([0-9]{1,2}\.?\s+\w+\.?,?\s*\d{4})/i) ?? + ctx.match(/erwartet[:\s]+([0-9]{1,2}\.?\s+\w+\.?,?\s*\d{4})/i) ?? + ctx.match(/([0-9]{1,2}\.[0-9]{1,2}\.[0-9]{4})/); + if (dm?.[1]) deDeliveryDate = parseGermanDate(dm[1]); + } + + // ── Global-Lager ─────────────────────────────────────────────────────── + let globalQty: number | undefined; + let globalDeliveryDate: string | undefined; + const glM = + t.match(/(\d[\d.,KkMm]*)\s*Stk\.\s*im\s*Global[- ]Lager/i) ?? + t.match(/(\d[\d.,KkMm]*)\s*im\s*Global[- ]?(?:Lager|Warehouse)/i) ?? + t.match(/(\d[\d.,KkMm]*)\s*in\s+Global\s+Warehouse/i); + if (glM?.[1]) { + globalQty = parseGermanQty(glM[1]); + const idx = t.indexOf(glM[0]); + const ctx = t.slice(idx, idx + 300); + const dm = + ctx.match(/Lieferung[:\s]+([0-9]{1,2}\.?\s+\w+\.?,?\s*\d{4})/i) ?? + ctx.match(/erwartet[:\s]+([0-9]{1,2}\.?\s+\w+\.?,?\s*\d{4})/i) ?? + ctx.match(/([0-9]{1,2}\.[0-9]{1,2}\.[0-9]{4})/); + if (dm?.[1]) globalDeliveryDate = parseGermanDate(dm[1]); + } + + // ── Nachlieferung ────────────────────────────────────────────────────── + let backorderQty: number | undefined; + let backorderDate: string | undefined; + const boM = + t.match(/(\d[\d.,KkMm]*)\s*(?:Stk\.)?\s*in\s+Nachlieferung/i) ?? + t.match(/Nachlieferung[:\s]*(\d[\d.,KkMm]*)/i); + if (boM?.[1]) { + backorderQty = parseGermanQty(boM[1]); + const idx = t.indexOf(boM[0]); + const ctx = t.slice(idx, idx + 300); + const dm = + ctx.match(/[Ee]rwartet[:\s]+([0-9]{1,2}\.?\s+\w+\.?,?\s*\d{4})/i) ?? + ctx.match(/Lieferung[:\s]+([0-9]{1,2}\.?\s+\w+\.?,?\s*\d{4})/i) ?? 
+ ctx.match(/([0-9]{1,2}\.[0-9]{1,2}\.[0-9]{4})/); + if (dm?.[1]) backorderDate = parseGermanDate(dm[1]); + } + + // ── Units sold ───────────────────────────────────────────────────────── + let unitsSold: number | undefined; + const soldM = + t.match(/(\d[\d.,KkMm]*)\s*(?:[Mm]al\s+)?[Vv]erkauft/) ?? + t.match(/([\d.,KkMm]+)\+?\s*sold/i); + if (soldM?.[1]) unitsSold = parseGermanQty(soldM[1]); + + // ── Part number refinement ───────────────────────────────────────────── + let partNumber = listingPn; + const pnM = t.match( + /(?:Part\s+Number|Teilenummer|Artikelnummer|P\/N)[:\s]+([A-Z0-9][-A-Z0-9./]{3,40})/i + ); + if (pnM?.[1]) partNumber = pnM[1].trim(); + + const resolveUrl = (u: string): string | undefined => { + if (!u) return undefined; + if (u.startsWith("//")) return `https:${u}`; + if (u.startsWith("/")) return `${BASE_URL}${u}`; + if (u.startsWith("http")) return u; + return undefined; + }; + + const compatibleBrands = [...new Set(raw.brands)].filter((b) => b.length > 1).slice(0, 30); + + log.info( + `${partNumber}: €${priceNet?.toFixed(2) ?? "?"} | ` + + `DE=${deQty ?? "-"} GL=${globalQty ?? "-"} BO=${backorderQty ?? "-"} ` + + `sold=${unitsSold ?? 
"-"} brands=${compatibleBrands.length}` + ); + + details.push({ + url, + name: raw.h1 || listingName, + partNumber, + priceNet, + deQty, + deDeliveryDate, + globalQty, + globalDeliveryDate, + backorderQty, + backorderDate, + unitsSold, + compatibleBrands, + specs: raw.specs, + imageUrl: resolveUrl(raw.imageUrl), + datasheetUrl: resolveUrl(raw.datasheetUrl), + }); + }, + }); + + await crawler.run(requests); + return details; +} + +// ── Main export ──────────────────────────────────────────────────────────────── + +export async function scrapeFs(): Promise { + console.log("=== FS.com Scraper v2 Starting ===\n"); + + // ── Quick connectivity check β€” exit early on datacenter IPs that block FS.com ─ + try { + const probe = await fetch("https://www.fs.com/robots.txt", { + signal: AbortSignal.timeout(8000), + headers: { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" }, + }); + if (!probe.ok && probe.status !== 403) throw new Error(`HTTP ${probe.status}`); + if (probe.status === 403) { + const body = (await probe.text()).slice(0, 200); + if (body.includes("Request blocked") || body.includes("ERROR")) { + console.warn("[FS.com] Server IP is blocked by FS.com β€” skipping. Run via run-fs-scraper-mac.sh on a residential IP."); + return; + } + } + } catch (err) { + const msg = (err as Error).message; + if (msg.includes("ENOTFOUND") || msg.includes("ECONNREFUSED") || msg.includes("ERR_EMPTY") || msg.includes("TimeoutError")) { + console.warn(`[FS.com] Connectivity check failed (${msg.slice(0, 60)}) β€” skipping. This scraper requires a residential IP.`); + return; + } + // For other errors (e.g. 
200 OK but weird body), proceed anyway + } + + const proxyConfiguration = buildProxyConfiguration(); + + const vendorId = await ensureVendor( + "FS.COM", + "compatible", + "https://www.fs.com", + "https://www.fs.com/de/c/optical-transceivers-9" + ); + console.log(`Vendor ID: ${vendorId}`); + + // ── Phase 1: Discover product URLs ───────────────────────────────────────── + console.log("\n[Phase 1] Collecting product URLs from category listing pages…"); + const productMap = await collectProductUrls(proxyConfiguration); + + if (productMap.size === 0) { + console.warn("[Phase 1] No products discovered β€” check selectors or proxy."); + return; + } + + // ── Filter: skip products with fresh stock data ───────────────────────────── + const allPartNumbers = [...productMap.values()].map((p) => p.partNumber).filter(Boolean); + const freshlyScraped = new Set(); + if (allPartNumbers.length > 0) { + const freshResult = await pool.query( + `SELECT DISTINCT t.part_number + FROM transceivers t + JOIN stock_observations so ON so.transceiver_id = t.id + WHERE so.source_vendor_id = $1 + AND so.time > NOW() - INTERVAL '${STOCK_FRESH_HOURS} hours' + AND t.part_number = ANY($2)`, + [vendorId, allPartNumbers] + ); + for (const row of freshResult.rows) freshlyScraped.add(row.part_number as string); + } + + const urlsToScrape = [...productMap.keys()] + .filter((url) => !freshlyScraped.has(productMap.get(url)?.partNumber ?? 
"")) + .slice(0, MAX_DETAIL_PAGES_PER_RUN); + + console.log(`\n[Phase 2] Scraping ${urlsToScrape.length} detail pages`); + console.log(` (${productMap.size - urlsToScrape.length} skipped β€” data ≀${STOCK_FRESH_HOURS}h fresh)`); + + if (urlsToScrape.length === 0) { + console.log("[Phase 2] All products have fresh stock data β€” nothing to scrape."); + return; + } + + // ── Phase 2: Scrape detail pages ──────────────────────────────────────────── + const detailRequests = urlsToScrape.map((url) => { + const s = productMap.get(url); + return { url, userData: { name: s?.name ?? "FS.com Product", partNumber: s?.partNumber ?? "" } }; + }); + + const details = await scrapeProductDetails(detailRequests, proxyConfiguration); + console.log(`[Phase 2] Complete β€” ${details.length} pages scraped.`); + + // ── Phase 3: Write to database ───────────────────────────────────────────── + console.log("\n[Phase 3] Writing to database…"); + let priceWritten = 0; + let stockWritten = 0; + let specsUpdated = 0; + let errors = 0; + + for (const detail of details) { + try { + const ff = detectFormFactor(detail.name); + const speedInfo = detectSpeed(detail.name); + const reach = detectReach(detail.name); + const parsed = parseSpecTable(detail.specs); + + const transceiverId = await findOrCreateScrapedTransceiver({ + partNumber: detail.partNumber, + vendorId, + formFactor: ff, + speedGbps: speedInfo?.speedGbps, + speed: speedInfo?.speed, + reachLabel: reach ?? parsed.reachLabel, + reachMeters: parsed.reachMeters, + fiberType: parsed.fiberType, + wavelengths: parsed.wavelengths, + imageUrl: detail.imageUrl, + category: "DataCenter", }); - if (Object.keys(specData.specs).length > 0) { - const parsed = parseSpecTable(specData.specs); + const stockLevel = deriveStockLevel(detail.deQty, detail.globalQty, detail.backorderQty); + const totalQty = (detail.deQty ?? 0) + (detail.globalQty ?? 
0); + + if (detail.priceNet && detail.priceNet > 0) { + const hash = contentHash({ + p: detail.priceNet, + de: detail.deQty ?? 0, + gl: detail.globalQty ?? 0, + }); + const isNew = await upsertPriceObservation({ + transceiverId, + sourceVendorId: vendorId, + price: detail.priceNet, + currency: "EUR", + stockLevel, + quantityAvailable: totalQty > 0 ? totalQty : undefined, + url: detail.url, + contentHash: hash, + }); + if (isNew) priceWritten++; + } + + const stockNew = await upsertStockObservation({ + transceiverId, + sourceVendorId: vendorId, + stockLevel, + quantityAvailable: totalQty > 0 ? totalQty : undefined, + warehouseDeQty: detail.deQty, + warehouseDeDeliveryDate: detail.deDeliveryDate ?? null, + warehouseGlobalQty: detail.globalQty, + warehouseGlobalDeliveryDate: detail.globalDeliveryDate ?? null, + backorderQty: detail.backorderQty, + backorderEstimatedDate: detail.backorderDate ?? null, + unitsSold: detail.unitsSold, + compatibleBrands: detail.compatibleBrands, + priceNet: detail.priceNet, + productUrl: detail.url, + }); + if (stockNew) stockWritten++; + + if (Object.keys(detail.specs).length > 0) { const updated = await updateVerifiedSpecs({ transceiverId, fiberType: parsed.fiberType, @@ -397,33 +760,25 @@ export async function scrapeFs(): Promise { tempRange: parsed.tempRange, modulation: parsed.modulation, domSupport: parsed.domSupport, - imageUrl: specData.imageUrl ? (specData.imageUrl.startsWith("http") ? specData.imageUrl : `${BASE_URL}${specData.imageUrl}`) : undefined, - datasheetUrl: specData.datasheetUrl ? (specData.datasheetUrl.startsWith("http") ? 
specData.datasheetUrl : `${BASE_URL}${specData.datasheetUrl}`) : undefined, + imageUrl: detail.imageUrl, + datasheetUrl: detail.datasheetUrl, source: "fs.com", }); if (updated) specsUpdated++; } - }, - }); - - // Build spec crawl requests (limit to 200 per run to avoid rate limiting) - const specRequests = needsSpecs.rows - .filter(r => productUrls.has(r.id)) - .slice(0, 200) - .map(r => ({ - url: productUrls.get(r.id)!, - userData: { transceiverId: r.id }, - })); - - if (specRequests.length > 0) { - console.log(`Crawling ${specRequests.length} product detail pages for specs...`); - await specCrawler.run(specRequests); - console.log(`Specs verified: ${specsUpdated} products updated`); - } else { - console.log("No product URLs available for spec verification this run"); + } catch (err) { + console.error(` βœ— ${detail.partNumber}: ${(err as Error).message}`); + errors++; + } } - console.log("=== FS.com Scraper Complete ===\n"); + console.log("\n=== FS.com Scraper v2 Complete ==="); + console.log(` Products discovered: ${productMap.size}`); + console.log(` Detail pages scraped: ${details.length}`); + console.log(` Price observations: ${priceWritten} new`); + console.log(` Stock observations: ${stockWritten} new`); + console.log(` Specs verified: ${specsUpdated}`); + if (errors > 0) console.warn(` DB errors: ${errors}`); } if (require.main === module) { diff --git a/packages/scraper/src/scrapers/smartoptics.ts b/packages/scraper/src/scrapers/smartoptics.ts index 2e9712d..5099b06 100644 --- a/packages/scraper/src/scrapers/smartoptics.ts +++ b/packages/scraper/src/scrapers/smartoptics.ts @@ -1,54 +1,65 @@ /** * SmartOptics Scraper β€” Premium coherent/DWDM transceiver manufacturer * - * smartoptics.com β€” WordPress site, no prices (B2B, RFQ model). - * Scrapes product catalog for specs, images, datasheets. - * Products listed at /products/optical-transceivers/ β†’ individual /product/SKU/ pages. + * smartoptics.com β€” WordPress/WooCommerce, no prices (B2B, RFQ only). 
+ * Scrapes product catalog for specs, images, and datasheets. + * + * v2 fixes: + * - Multi-category crawl (coherent, DWDM, access, SFP, QSFP) + * - Handles both absolute AND relative product URLs + * - WooCommerce REST API fallback for complete product list + * - Up to 10 pagination pages per category */ import { pool, findOrCreateScrapedTransceiver, ensureVendor } from "../utils/db"; const BASE = "https://smartoptics.com"; -const CATALOG_URL = `${BASE}/products/optical-transceivers/`; const HEADERS = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", - Accept: "text/html,application/xhtml+xml", + "User-Agent": + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", + Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", }; +/** All transceiver-related catalog category pages to crawl */ +const CATALOG_PAGES = [ + "/products/optical-transceivers/", + "/products/", + "/product-category/optical-transceivers/", + "/product-category/transceivers/", + "/product-category/sfp/", + "/product-category/qsfp/", + "/product-category/coherent/", + "/product-category/dwdm/", +]; + function sleep(ms: number): Promise { return new Promise((r) => setTimeout(r, ms)); } function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } { const t = text.toLowerCase(); - if (t.includes("qsfp-dd800") || t.includes("sfp-dd800") || t.includes("800ge")) return { formFactor: "QSFP-DD", speed: "800G", speedGbps: 800 }; - if (t.includes("qsfp-dd") || (t.includes("400g") && t.includes("qsfp"))) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }; - if (t.includes("qsfp112")) return { formFactor: "QSFP112", speed: "400G", speedGbps: 400 }; - if (t.includes("qsfp56")) return { formFactor: "QSFP56", speed: "200G", speedGbps: 200 }; - if 
(t.includes("qsfp28") || t.includes("100ge") || t.includes("100g")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 }; - if (t.includes("sfp28") || t.includes("25g")) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 }; - if (t.includes("qsfp+") || t.includes("40g")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 }; - if (t.includes("sfp+") || t.includes("10g")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 }; - if (t.includes("sfp")) return { formFactor: "SFP", speed: "1G", speedGbps: 1 }; + if (t.includes("qsfp-dd800") || t.includes("800ge")) return { formFactor: "QSFP-DD", speed: "800G", speedGbps: 800 }; + if (t.includes("qsfp-dd") || (t.includes("400g") && t.includes("qsfp"))) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }; + if (t.includes("qsfp112")) return { formFactor: "QSFP112", speed: "400G", speedGbps: 400 }; + if (t.includes("qsfp56")) return { formFactor: "QSFP56", speed: "200G", speedGbps: 200 }; + if (t.includes("qsfp28") || t.includes("100ge") || t.includes("100g")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 }; + if (t.includes("sfp28") || t.includes("25g")) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 }; + if (t.includes("qsfp+") || t.includes("40g")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 }; + if (t.includes("sfp+") || t.includes("10g")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 }; + if (t.includes("sfp")) return { formFactor: "SFP", speed: "1G", speedGbps: 1 }; return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 }; } function detectReach(text: string): { label: string; meters: number } | undefined { - const kmMatch = text.match(/(\d+)\s*km/i); - if (kmMatch) { - const km = parseInt(kmMatch[1]); - return { label: `${km}km`, meters: km * 1000 }; - } + const kmMatch = text.match(/(\d+(?:\.\d+)?)\s*km/i); + if (kmMatch) { const km = parseFloat(kmMatch[1]); return { label: `${km}km`, meters: km * 1000 }; } const mMatch 
= text.match(/(\d+)\s*m\b/i); - if (mMatch) { - const m = parseInt(mMatch[1]); - return { label: `${m}m`, meters: m }; - } + if (mMatch) { const m = parseInt(mMatch[1]); return { label: `${m}m`, meters: m }; } return undefined; } function detectFiber(text: string): string { - if (/dwdm|cwdm|coherent|coh|single.?mode|smf/i.test(text)) return "SMF"; - if (/multi.?mode|mmf|sr/i.test(text)) return "MMF"; + if (/multi.?mode|mmf|sr\b/i.test(text)) return "MMF"; return "SMF"; // SmartOptics is almost exclusively SMF/coherent } @@ -58,15 +69,52 @@ async function fetchPage(url: string): Promise { return resp.text(); } -function extractProductUrls(html: string): string[] { +/** + * Extract all /product/xxx/ URLs from an HTML page. + * Handles both absolute (https://smartoptics.com/product/...) and + * root-relative (/product/...) href patterns. + */ +function extractProductUrls(html: string, pageUrl: string): string[] { const urls = new Set(); - const regex = /href="(https?:\/\/smartoptics\.com\/product\/[^"]+)"/gi; + + // Absolute URLs + const absRegex = /href="(https?:\/\/(?:www\.)?smartoptics\.com\/product\/[^"#?]+)"/gi; let m: RegExpExecArray | null; - while ((m = regex.exec(html)) !== null) { - const u = m[1].replace(/\/$/, "") + "/"; - urls.add(u); + while ((m = absRegex.exec(html)) !== null) { + urls.add(normalizeProductUrl(m[1])); + } + + // Root-relative: href="/product/..." or href="/products/..." (individual product, not category) + const relRegex = /href="(\/product\/[^"#?]+)"/gi; + while ((m = relRegex.exec(html)) !== null) { + urls.add(normalizeProductUrl(`${BASE}${m[1]}`)); + } + + // WooCommerce data attributes: data-permalink or data-product-url + const dataRegex = /data-(?:permalink|product-url)="([^"]*\/product\/[^"]+)"/gi; + while ((m = dataRegex.exec(html)) !== null) { + const u = m[1].startsWith("http") ? 
m[1] : `${BASE}${m[1]}`; + urls.add(normalizeProductUrl(u)); + } + + // Filter out category pages β€” only keep individual product URLs + return Array.from(urls).filter((u) => { + const path = new URL(u).pathname; + // Must be /product/something β€” not /products/ (that's a category) + return path.startsWith("/product/") && path.split("/").filter(Boolean).length >= 2; + }); +} + +function normalizeProductUrl(url: string): string { + // Ensure trailing slash, strip query and fragment + try { + const u = new URL(url); + let path = u.pathname; + if (!path.endsWith("/")) path += "/"; + return `${u.origin}${path}`; + } catch { + return url; } - return Array.from(urls); } interface ProductData { @@ -74,6 +122,7 @@ interface ProductData { name: string; url: string; imageUrl?: string; + datasheetUrl?: string; formFactor: string; speed: string; speedGbps: number; @@ -88,23 +137,51 @@ async function scrapeProductPage(url: string): Promise { try { const html = await fetchPage(url); - const nameMatch = html.match(/]*>([^<]+)<\/h1>/) || html.match(/og:title" content="([^"]+)"/); - const name = nameMatch ? nameMatch[1].trim().replace(/ \| Smartoptics$/, "") : ""; - if (!name) return null; + // Product name β€” try OG tag first (most reliable), then H1 + const nameMatch = + html.match(/property="og:title"\s+content="([^"]+)"/) || + html.match(/content="([^"]+)"\s+property="og:title"/) || + html.match(/]*class="[^"]*(?:product_title|entry-title)[^"]*"[^>]*>([^<]+)<\/h1>/i) || + html.match(/]*>([^<]+)<\/h1>/); + const rawName = nameMatch?.[1]?.trim() ?? 
""; + const name = rawName.replace(/\s*\|\s*Smartoptics\s*$/, "").replace(/\s*–\s*Smartoptics\s*$/, "").trim(); + if (!name || name.length < 4) return null; - const sku = url.split("/").filter(Boolean).pop()?.toUpperCase() || name.replace(/\s+/g, "-"); + // SKU β€” try WooCommerce SKU field first + const skuMatch = + html.match(/(?:SKU|Artikelnummer)[^<]*<\/[^>]+>\s*<[^>]+>([A-Z0-9][-A-Z0-9./]{2,40})/i) || + html.match(/"sku"\s*:\s*"([^"]+)"/) || + html.match(/class="sku"[^>]*>([^<]+)]+src="([^"]*wp-content\/uploads[^"]*\.(?:png|jpg|webp))"[^>]* class="[^"]*product/i); - const imageUrl = imgMatch ? imgMatch[1] : undefined; + // Product image + const imgMatch = + html.match(/property="og:image"\s+content="([^"]+)"/) || + html.match(/content="([^"]+)"\s+property="og:image"/) || + html.match(/]+src="([^"]*wp-content\/uploads[^"]*\.(?:png|jpg|webp))"[^>]+class="[^"]*(?:wp-post-image|attachment-shop_single)[^"]*"/i); + const imageUrl = imgMatch?.[1]; + + // Datasheet PDF link + const dsMatch = html.match(/href="([^"]*\.pdf)"[^>]*>.*?(?:datasheet|datenblatt|spec)/gi); + const datasheetUrl = dsMatch + ? (dsMatch[0].match(/href="([^"]+)"/) ?? [])[1] + : undefined; const ff = detectFormFactor(name); const reach = detectReach(name); - const coherent = /coherent|coh-t|coh\.|dwdm|dp-qpsk|qpsk|cfp2/i.test(name + html.slice(0, 3000)); + const pageText = html.slice(0, 5000); // only check first 5KB for coherent detection + const coherent = /coherent|coh-t|coh\.|dp-qpsk|qpsk|cfp2/i.test(name + pageText); const wdmType = /dwdm/i.test(name) ? "DWDM" : /cwdm/i.test(name) ? 
"CWDM" : undefined; return { - sku, name, url, imageUrl, + sku, + name, + url, + imageUrl, + datasheetUrl, ...ff, reachLabel: reach?.label, reachMeters: reach?.meters, @@ -113,14 +190,40 @@ async function scrapeProductPage(url: string): Promise { wdmType, }; } catch (err) { - console.warn(` Failed ${url}: ${(err as Error).message}`); + console.warn(` Failed ${url}: ${(err as Error).message.slice(0, 80)}`); return null; } } +/** Try WooCommerce REST API for a complete product list (often publicly accessible) */ +async function tryWooCommerceApi(): Promise { + const urls: string[] = []; + try { + for (let page = 1; page <= 20; page++) { + const apiUrl = `${BASE}/wp-json/wc/v3/products?per_page=100&page=${page}&category=optical-transceivers&status=publish`; + const resp = await fetch(apiUrl, { + headers: { ...HEADERS, Accept: "application/json" }, + signal: AbortSignal.timeout(10000), + }); + if (!resp.ok) break; + const products = await resp.json() as Array<{ permalink?: string; slug?: string }>; + if (!Array.isArray(products) || products.length === 0) break; + for (const p of products) { + if (p.permalink) urls.push(normalizeProductUrl(p.permalink)); + else if (p.slug) urls.push(normalizeProductUrl(`${BASE}/product/${p.slug}/`)); + } + if (products.length < 100) break; + await sleep(500); + } + } catch { + // API not accessible β€” not unusual, fall through to HTML crawl + } + return urls; +} + export async function scrapeSmartOptics(): Promise { - console.log("=== SmartOptics Scraper Starting ===\n"); - console.log("Note: SmartOptics is B2B β€” no public prices. Scraping specs + images only.\n"); + console.log("=== SmartOptics Scraper v2 Starting ===\n"); + console.log("Note: SmartOptics is B2B β€” no public prices. 
Scraping specs + catalog only.\n"); const vendorId = await ensureVendor( "SmartOptics", @@ -130,32 +233,54 @@ export async function scrapeSmartOptics(): Promise { ); const productUrls = new Set(); - for (let page = 1; page <= 10; page++) { - try { - const url = page === 1 ? CATALOG_URL : `${CATALOG_URL}page/${page}/`; - const html = await fetchPage(url); - const urls = extractProductUrls(html); - if (urls.length === 0) break; - urls.forEach((u) => productUrls.add(u)); - console.log(` Catalog page ${page}: ${urls.length} products`); - await sleep(1500); - } catch { - break; + + // ── Try WooCommerce REST API first (fastest, most complete) ────────────── + console.log("[1] Trying WooCommerce REST API…"); + const apiUrls = await tryWooCommerceApi(); + if (apiUrls.length > 0) { + console.log(` API returned ${apiUrls.length} products`); + apiUrls.forEach((u) => productUrls.add(u)); + } else { + console.log(" API not accessible β€” falling back to HTML crawl"); + } + + // ── HTML catalog crawl (always run to catch any API misses) ─────────────── + console.log("[2] Crawling category pages…"); + for (const catPath of CATALOG_PAGES) { + const catBase = `${BASE}${catPath}`; + for (let page = 1; page <= 10; page++) { + const pageUrl = page === 1 ? 
catBase : `${catBase}page/${page}/`; + try { + const html = await fetchPage(pageUrl); + const found = extractProductUrls(html, pageUrl); + if (found.length === 0 && page > 1) break; // no more pages in this category + if (found.length === 0 && page === 1) break; // category doesn't exist + found.forEach((u) => productUrls.add(u)); + console.log(` ${catPath} p${page}: ${found.length} products`); + await sleep(1200); + } catch (err) { + const msg = (err as Error).message; + if (!msg.includes("404")) console.warn(` ${pageUrl}: ${msg.slice(0, 60)}`); + break; + } } } - console.log(`\nTotal product URLs: ${productUrls.size}`); + console.log(`\nTotal unique product URLs: ${productUrls.size}`); if (productUrls.size === 0) { - console.log("No products found β€” site may have changed structure"); + console.warn("No products found β€” SmartOptics site structure may have changed"); return; } + // ── Scrape individual product pages ─────────────────────────────────────── + console.log("\n[3] Scraping product detail pages…"); let saved = 0; let withImages = 0; + let failed = 0; for (const url of productUrls) { const product = await scrapeProductPage(url); - if (!product) continue; + if (!product) { failed++; continue; } try { await findOrCreateScrapedTransceiver({ @@ -173,14 +298,18 @@ export async function scrapeSmartOptics(): Promise { }); saved++; if (product.imageUrl) withImages++; - console.log(` βœ“ ${product.sku} β€” ${product.name.slice(0, 60)}`); + console.log(` βœ“ ${product.sku.slice(0, 25).padEnd(25)} ${product.name.slice(0, 50)}`); } catch (err) { - console.warn(` Error saving ${product.sku}: ${(err as Error).message.slice(0, 80)}`); + console.warn(` βœ— ${product.sku}: ${(err as Error).message.slice(0, 80)}`); } - await sleep(1500); + await sleep(1200); } - console.log(`\n=== SmartOptics Complete: ${saved} products, ${withImages} with images ===`); + console.log(`\n=== SmartOptics v2 Complete ===`); + console.log(` Products discovered: ${productUrls.size}`); + 
console.log(` Saved to DB: ${saved}`); + console.log(` With images: ${withImages}`); + if (failed > 0) console.warn(` Failed pages: ${failed}`); } if (require.main === module) { diff --git a/packages/scraper/src/utils/db.ts b/packages/scraper/src/utils/db.ts index 5da175e..fe1cbee 100644 --- a/packages/scraper/src/utils/db.ts +++ b/packages/scraper/src/utils/db.ts @@ -18,6 +18,28 @@ export const pool = new Pool({ // Alias β€” some scrapers import { db } instead of { pool } export const db = pool; +/** + * After any verified flag is set, check if all 4 criteria are met and promote + * the transceiver to fully_verified. Call this wherever price/image/details/ + * competitor_verified are written so the counter stays consistent. + */ +export async function checkAndSetFullyVerified(transceiverId: string): Promise { + const result = await pool.query( + `UPDATE transceivers + SET fully_verified = true, + fully_verified_at = COALESCE(fully_verified_at, NOW()) + WHERE id = $1 + AND price_verified = true + AND image_verified = true + AND details_verified = true + AND competitor_verified = true + AND (fully_verified IS NULL OR fully_verified = false) + RETURNING id`, + [transceiverId] + ); + return (result.rowCount ?? 0) > 0; +} + // Per-form-factor price bounds [min, max] in USD equivalent const PRICE_BOUNDS: Record = { "SFP": [2, 3000], @@ -67,6 +89,12 @@ export async function upsertPriceObservation(params: { : params.currency === "GBP" ? params.price * 1.27 : params.price; + // Hard floor: no transceiver of any type can cost less than $1.50 β€” catches accessories/cables + // misidentified as transceivers (e.g. 
FS-XXXXX DAC cables scraped as OSFP/QSFP28) + if (priceUsd < 1.5) { + return false; + } + const anomalous = await isPriceAnomalous(params.transceiverId, priceUsd); if (anomalous) { return false; // Reject price outside form-factor bounds @@ -96,6 +124,7 @@ export async function upsertPriceObservation(params: { WHERE id = $1 AND (price_verified IS NULL OR price_verified = false OR ${isCompetitor ? "competitor_verified IS NULL OR competitor_verified = false" : "false"})`, [params.transceiverId] ); + await checkAndSetFullyVerified(params.transceiverId); return false; // No change } @@ -131,9 +160,100 @@ export async function upsertPriceObservation(params: { [params.transceiverId] ); } + await checkAndSetFullyVerified(params.transceiverId); return true; // New observation written } +/** + * Upsert a stock observation with full warehouse breakdown (FS.com v2). + * Writes to stock_observations including DE-Lager, Global-Lager, Nachlieferung, + * units_sold, compatible_brands, price_net, and product_url columns. + * Returns true only when the data has changed since the last observation. 
+ */ +export async function upsertStockObservation(params: { + transceiverId: string; + sourceVendorId: string; + stockLevel: string; + quantityAvailable?: number; + warehouseDeQty?: number; + warehouseDeDeliveryDate?: string | null; + warehouseGlobalQty?: number; + warehouseGlobalDeliveryDate?: string | null; + backorderQty?: number; + backorderEstimatedDate?: string | null; + unitsSold?: number; + compatibleBrands?: string[]; + priceNet?: number; + productUrl?: string; +}): Promise { + // Skip if there is genuinely no warehouse data at all + if ( + params.warehouseDeQty === undefined && + params.warehouseGlobalQty === undefined && + params.quantityAvailable === undefined + ) { + return false; + } + + // Compare against the last observation to avoid duplicate writes + const lastObs = await pool.query( + `SELECT warehouse_de_qty, warehouse_global_qty, backorder_qty, units_sold + FROM stock_observations + WHERE transceiver_id = $1 AND source_vendor_id = $2 + ORDER BY time DESC LIMIT 1`, + [params.transceiverId, params.sourceVendorId] + ); + + if (lastObs.rows.length > 0) { + const r = lastObs.rows[0]; + const unchanged = + (r.warehouse_de_qty ?? null) === (params.warehouseDeQty ?? null) && + (r.warehouse_global_qty ?? null) === (params.warehouseGlobalQty ?? null) && + (r.backorder_qty ?? null) === (params.backorderQty ?? null) && + (r.units_sold ?? null) === (params.unitsSold ?? null); + if (unchanged) return false; + } + + const inStock = + ((params.warehouseDeQty ?? 0) + (params.warehouseGlobalQty ?? 
0)) > 0;
+
+  await pool.query(
+    `INSERT INTO stock_observations (
+       time, transceiver_id, source_vendor_id,
+       in_stock, quantity_available,
+       warehouse_de_qty, warehouse_de_delivery_date,
+       warehouse_global_qty, warehouse_global_delivery_date,
+       backorder_qty, backorder_estimated_date,
+       units_sold, compatible_brands, price_net, product_url
+     ) VALUES (
+       NOW(), $1, $2,
+       $3, $4,
+       $5, $6::date,
+       $7, $8::date,
+       $9, $10::date,
+       $11, $12, $13, $14
+     )`,
+    [
+      params.transceiverId,
+      params.sourceVendorId,
+      inStock,
+      params.quantityAvailable ?? null,
+      params.warehouseDeQty ?? null,
+      params.warehouseDeDeliveryDate ?? null,
+      params.warehouseGlobalQty ?? null,
+      params.warehouseGlobalDeliveryDate ?? null,
+      params.backorderQty ?? null,
+      params.backorderEstimatedDate ?? null,
+      params.unitsSold ?? null,
+      params.compatibleBrands?.length ? params.compatibleBrands : null,
+      params.priceNet ?? null,
+      params.productUrl ?? null,
+    ]
+  );
+
+  return true;
+}
+
 export async function findOrCreateScrapedTransceiver(params: {
   partNumber: string;
   vendorId: string;
@@ -160,6 +280,7 @@ export async function findOrCreateScrapedTransceiver(params: {
         `UPDATE transceivers SET image_url = $1, image_verified = true, updated_at = NOW() WHERE id = $2`,
         [params.imageUrl, existing.rows[0].id]
       );
+      await checkAndSetFullyVerified(existing.rows[0].id);
     }
     return existing.rows[0].id;
   }
diff --git a/run-fs-scraper-mac.sh b/run-fs-scraper-mac.sh
new file mode 100755
index 0000000..9e54678
--- /dev/null
+++ b/run-fs-scraper-mac.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+# FS.com Scraper — Mac-side runner
+# Runs from this Mac (residential IP) so FS.com isn't blocked.
+# Opens SSH tunnel to Erik's DB → runs scraper → closes tunnel.
+#
+# Schedule: launchd at 02:00, 10:00, 18:00 daily
+# Log: ~/Library/Logs/tip-fs-scraper.log
+
+set -euo pipefail
+
+LOG="$HOME/Library/Logs/tip-fs-scraper.log"
+REPO="/Users/renefichtmueller/Desktop/Claude Code/github-repos/transceiver-db"
+NODE="/opt/homebrew/bin/node"
+NPX="/opt/homebrew/bin/npx"
+TUNNEL_PID_FILE="/tmp/tip-db-tunnel.pid"
+DB_LOCAL_PORT=5433
+
+# launchd's default PATH does not include /opt/homebrew/bin, and npx is an
+# `env node` script, so the directory holding $NODE must be on PATH to start it.
+export PATH="$(dirname "$NODE"):$PATH"
+
+# Timestamped logger: prints the message and appends it to $LOG.
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOG"; }
+
+# ── Open SSH tunnel if not already running ────────────────────────────────────
+# NOTE(review): nothing in this script writes $TUNNEL_PID_FILE, so the PID check
+# below only fires if some other tool creates that file — confirm or remove.
+open_tunnel() {
+  if [ -f "$TUNNEL_PID_FILE" ]; then
+    PID=$(cat "$TUNNEL_PID_FILE")
+    if kill -0 "$PID" 2>/dev/null; then
+      log "Tunnel already running (PID $PID)"
+      return 0
+    fi
+  fi
+  log "Opening SSH tunnel → Erik PostgreSQL on port $DB_LOCAL_PORT…"
+  # ExitOnForwardFailure makes ssh exit non-zero (aborting via set -e) when the
+  # local port cannot be bound, instead of backgrounding a useless connection.
+  # The option is placed after -L so close_tunnel's pkill pattern still matches.
+  ssh -N -f -L "${DB_LOCAL_PORT}:localhost:${DB_LOCAL_PORT}" -o ExitOnForwardFailure=yes erik
+  # -f forks to background, no PID tracking needed — use pkill to close
+  log "Tunnel opened"
+  sleep 2 # Give the tunnel a moment to establish
+}
+
+# Kill the tunnel by matching its command line, then remove any PID file.
+close_tunnel() {
+  log "Closing SSH tunnel…"
+  pkill -f "ssh -N -f -L ${DB_LOCAL_PORT}:localhost:${DB_LOCAL_PORT}" 2>/dev/null || true
+  rm -f "$TUNNEL_PID_FILE"
+}
+
+# ── Main ──────────────────────────────────────────────────────────────────────
+mkdir -p "$(dirname "$LOG")"
+log "=== FS.com Mac Scraper starting ==="
+
+# Only close tunnel if we opened it (not if one was already running)
+OPENED_TUNNEL=0
+if ! pgrep -f "ssh -N.*${DB_LOCAL_PORT}:localhost" >/dev/null 2>&1; then
+  open_tunnel
+  OPENED_TUNNEL=1
+  trap close_tunnel EXIT
+fi
+
+cd "$REPO"
+
+export POSTGRES_HOST=localhost
+export POSTGRES_PORT=$DB_LOCAL_PORT
+export POSTGRES_DB=transceiver_db
+export POSTGRES_USER=tip
+# NOTE(review): a production credential is committed here. A value already in
+# the environment now takes precedence; rotate this password and supply it via
+# the launchd plist or ~/.pgpass so the fallback can be deleted.
+export POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-tip_prod_2026}"
+export NODE_ENV=production
+
+log "Running fs-com scraper via tsx…"
+"$NPX" tsx packages/scraper/src/scrapers/fs-com.ts 2>&1 | tee -a "$LOG"
+
+log "=== FS.com Mac Scraper complete ==="
diff --git a/sql/034-blog-review-tag.sql b/sql/034-blog-review-tag.sql
new file mode 100644
index 0000000..289fe66
--- /dev/null
+++ b/sql/034-blog-review-tag.sql
@@ -0,0 +1,7 @@
+-- Migration 034: Add review_tag column to blog_drafts for manual reviewed tracking
+-- Used by dashboard to let editor mark posts as reviewed before publishing
+
+ALTER TABLE blog_drafts
+  ADD COLUMN IF NOT EXISTS review_tag VARCHAR(32) DEFAULT NULL;
+
+COMMENT ON COLUMN blog_drafts.review_tag IS 'Manual review status tag — set to ''reviewed'' when editor has proofread the post, NULL otherwise';
diff --git a/sql/035-price-observations-is-anomalous.sql b/sql/035-price-observations-is-anomalous.sql
new file mode 100644
index 0000000..6b4c260
--- /dev/null
+++ b/sql/035-price-observations-is-anomalous.sql
@@ -0,0 +1,12 @@
+-- Migration 035: Add is_anomalous column to price_observations
+-- This column marks price entries as outliers/anomalous that should be excluded from display
+
+ALTER TABLE price_observations
+  ADD COLUMN IF NOT EXISTS is_anomalous BOOLEAN NOT NULL DEFAULT false;
+
+CREATE INDEX IF NOT EXISTS idx_price_obs_anomalous
+  ON price_observations (transceiver_id, is_anomalous)
+  WHERE is_anomalous = false;
+
+COMMENT ON COLUMN price_observations.is_anomalous IS
+  'True when this price is flagged as an outlier/anomaly and should be excluded from price displays and comparisons';
diff --git a/sql/036-transceiver-equivalences.sql b/sql/036-transceiver-equivalences.sql
new file mode
100644
index 0000000..ca15213
--- /dev/null
+++ b/sql/036-transceiver-equivalences.sql
@@ -0,0 +1,51 @@
+-- Migration 036: Transceiver equivalences for competitor_verified matching
+-- Stores semantic equivalences between Flexoptix SKUs and competitor products
+-- matched by technical specs (form_factor + speed + reach + standard + fiber_type)
+
+CREATE TABLE IF NOT EXISTS transceiver_equivalences (
+  id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
+  flexoptix_id UUID NOT NULL REFERENCES transceivers(id) ON DELETE CASCADE,
+  competitor_id UUID NOT NULL REFERENCES transceivers(id) ON DELETE CASCADE,
+  confidence DECIMAL(4,3) NOT NULL CHECK (confidence BETWEEN 0 AND 1),
+  match_basis TEXT[] NOT NULL DEFAULT '{}', -- ['standard_name','form_factor','speed_gbps','fiber_type','reach']
+  match_notes TEXT,
+  status VARCHAR(20) NOT NULL DEFAULT 'pending'
+    CHECK (status IN ('pending','approved','rejected','auto_approved')),
+  reviewed_by VARCHAR(200),
+  reviewed_at TIMESTAMPTZ,
+  reject_reason TEXT,
+  -- NOT NULL: an explicit NULL would otherwise bypass the DEFAULT and leave
+  -- rows that cannot be ordered or audited by time.
+  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+  updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+  UNIQUE (flexoptix_id, competitor_id),
+  -- A transceiver cannot be its own competitor equivalent.
+  CONSTRAINT transceiver_equivalences_distinct_pair_check
+    CHECK (flexoptix_id <> competitor_id)
+);
+
+-- NOTE(review): the UNIQUE (flexoptix_id, competitor_id) index already serves
+-- lookups by flexoptix_id alone; idx_eq_flexoptix looks redundant — confirm.
+CREATE INDEX IF NOT EXISTS idx_eq_flexoptix ON transceiver_equivalences (flexoptix_id);
+CREATE INDEX IF NOT EXISTS idx_eq_competitor ON transceiver_equivalences (competitor_id);
+CREATE INDEX IF NOT EXISTS idx_eq_status ON transceiver_equivalences (status, confidence DESC);
+CREATE INDEX IF NOT EXISTS idx_eq_pending ON transceiver_equivalences (flexoptix_id) WHERE status = 'pending';
+
+-- Auto-update updated_at
+CREATE OR REPLACE FUNCTION update_equivalences_updated_at()
+RETURNS TRIGGER AS $$
+BEGIN
+  NEW.updated_at = NOW();
+  RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+DROP TRIGGER IF EXISTS trg_eq_updated_at ON transceiver_equivalences;
+CREATE TRIGGER trg_eq_updated_at
+  BEFORE UPDATE ON transceiver_equivalences
+  FOR EACH ROW EXECUTE FUNCTION update_equivalences_updated_at();
+
+COMMENT ON TABLE transceiver_equivalences IS
+  'Semantic equivalences between Flexoptix SKUs and competitor products, '
+  'matched by technical specification overlap. Used to set competitor_verified=true '
+  'on Flexoptix transceivers that have no exact SKU match at competitors.';