Compare commits
No commits in common. "75cbd7cd869f5610a7f3f13cd500d0b869999dba" and "a4eb02fbdab6226cd2c7b6b9360640fff602b981" have entirely different histories.
75cbd7cd86
...
a4eb02fbda
@ -3,50 +3,8 @@
|
||||
Format: `{"d":"YYYY-MM-DD","t":"TYPE","m":"Description"}`
|
||||
Types: FEAT · FIX · UI · DATA · AI · INFRA
|
||||
|
||||
{"d":"2026-04-17","t":"DATA","m":"Competitor research: QSFPTEK shows real-time aggregated stock count (e.g. '5507 in real-time stock, 17 Apr 2026') + USD prices; NADDOD shows exact per-product counts ('In Stock: 543') via Astro SSR. Both scraped publicly, no login required. Flexoptix confirmed exact Lagerbestand + EUR prices. FS.com: EUR prices yes, exact counts no."}
|
||||
{"d":"2026-04-17","t":"DATA","m":"stock_observations selective cleanup + schema upgrade: TRUNCATE stock_observations (186 FS.com test-run rows cleared, will repopulate on next launchd run). Added 4 new quality columns via migration 038: stock_confidence (1=boolean/2=aggregated/3=per-warehouse), price_currency CHAR(3), price_includes_tax BOOLEAN, stock_vendor_ts TIMESTAMPTZ."}
|
||||
{"d":"2026-04-17","t":"FEAT","m":"Migration 028 retroactively committed to repo (028-stock-observations-warehouse-columns.sql) — documents the 10 warehouse columns applied directly to Erik DB. Guards with IF NOT EXISTS for safe re-application."}
|
||||
{"d":"2026-04-17","t":"FEAT","m":"upsertStockObservation upgraded: new optional params stockConfidence (1|2|3), priceCurrency (ISO 4217), priceIncludesTax (boolean), stockVendorTs (timestamptz). FS.com now writes stockConfidence=3+priceCurrency=EUR+priceIncludesTax=false. Delta detection now also checks quantity_available changes."}
|
||||
{"d":"2026-04-17","t":"FEAT","m":"QSFPTEK scraper v2: Phase 1 uses existing /mall/commodity/list API for product catalog (880+ products from sitemap). Phase 2 fetches /en/product/XXXXX.html detail pages to extract 'X in real-time stock, DATE' — writes stock_observations with stockConfidence=2 + stockVendorTs. Up to 500 detail pages per run at 2s rate limit."}
|
||||
{"d":"2026-04-17","t":"FEAT","m":"NADDOD scraper v2: complete rewrite — migrated from WooCommerce category scraping to Astro sitemap-based discovery (/sitemaps/products.xml, /products/XXXXX.html). Extracts 'In Stock: X' exact counts from server-rendered HTML. Writes both price_observations (USD) and stock_observations (stockConfidence=1 or 2 depending on data visibility)."}
|
||||
{"d":"2026-04-17","t":"DATA","m":"FS.com first warehouse data load: 268 products scraped, 186 stock_observations written — DE-Lager 128,428 units, Global-Lager 156,052 units, Backorder 37,495, 53.4M units sold total. Top seller: SFP-10GSR-85 with 14M units sold."}
|
||||
{"d":"2026-04-17","t":"FIX","m":"upsertStockObservation: skip condition now includes backorder_qty — backorder-only products (DE=0 GL=0 BO>0) like coherent ZR/ZRH were silently dropped instead of being recorded"}
|
||||
{"d":"2026-04-17","t":"FIX","m":"FS.com price extraction: broad fallback regex now only accepts prices >€100 to reject FS.com's €79 'Preis auf Anfrage' placeholder — prevents fake price observations on 1G/10G/25G/40G/100G transceivers"}
|
||||
{"d":"2026-04-17","t":"UI","m":"Dashboard: stock observations count in header stats bar + warehouse stock summary card in Overview tab (hidden until stock_observations populated); both driven by /api/health stock block"}
|
||||
{"d":"2026-04-17","t":"FEAT","m":"Health API: /api/health now includes stock block — total_observations, transceivers_with_stock, vendors_with_stock, total_de_qty, total_global_qty, last_observation_at from stock_observations"}
|
||||
{"d":"2026-04-17","t":"INFRA","m":"FS.com Mac-side runner: launchd plist at 02:00/10:00/18:00 + run-fs-scraper-mac.sh via SSH tunnel to Erik DB port 5433 — residential IP required, datacenter IP blocked by FS.com Cloudflare WAF"}
|
||||
{"d":"2026-04-17","t":"FEAT","m":"Stock API: GET /api/stock, /api/stock/summary, /api/stock/:id — warehouse breakdowns (DE-Lager, Global-Lager, Nachlieferung, units_sold) per transceiver/vendor"}
|
||||
{"d":"2026-04-17","t":"DATA","m":"upsertStockObservation() in db.ts — writes 10 new stock_observations columns (warehouse_de_qty, warehouse_global_qty, backorder_qty, units_sold, compatible_brands, price_net, product_url, delivery dates)"}
|
||||
{"d":"2026-04-17","t":"DATA","m":"FS.com scraper v2: Playwright-based, extracts DE-Lager + Global-Lager + Nachlieferung + Verkauft counts, German number/date parsing, 120-URL pre-queue, 12-category crawl, 12h dedup window"}
|
||||
{"d":"2026-04-17","t":"FIX","m":"SmartOptics scraper v2: WooCommerce REST API fallback + 8 catalog categories + relative URL regex fix — was finding only 8 products, now discovers full catalog"}
|
||||
|
||||
---
|
||||
|
||||
{"d":"2026-04-12","t":"FIX","m":"DB functions compute_transceiver_verification() + compute_transceiver_verification(uuid): both now require competitor_verified as 4th criterion for fully_verified — was silently ignoring competitor check and granting ★ 100% badge based on only 3 criteria"}
|
||||
{"d":"2026-04-12","t":"FEAT","m":"Scheduler: maintenance:reconcile-verification nightly job (01:00 UTC via pg-boss) — auto-resets competitor_verified=false where no non-Flexoptix price_observation in last 30 days, then recomputes fully_verified — eliminates recurring false ★ 100% badges without manual SQL intervention"}
|
||||
{"d":"2026-04-12","t":"DATA","m":"Data quality: 608 transceivers had competitor_verified=true with NO actual non-Flexoptix price in last 30 days — all reset to false + fully_verified=false. ★ 100% badge now only shows when genuinely earned. Triggered by user catching false badges on 1.6T OSFP products."}
|
||||
{"d":"2026-04-12","t":"FIX","m":"ATGBICS + FS.COM scrapers: PlaywrightCrawler useSessionPool=false added — eliminates SDK_SESSION_POOL_STATE.json crash on every run; withIsolatedStorage now pre-seeds empty session state file as belt-and-suspenders"}
|
||||
{"d":"2026-04-12","t":"FIX","m":"Skylane scraper: pagination now breaks on zero NEW unique product URLs (was looping all 10 pages because Algolia returns same content regardless of ?page=N)"}
|
||||
{"d":"2026-04-12","t":"FIX","m":"AscentOptics scraper fully rewritten: uses /product-list?is_render=1&category_id=CID JSON API (was hitting 404 on old /catalog/ URLs); hardcoded category IDs for 14 transceiver form factors; no prices (OEM Get Quote model)"}
|
||||
{"d":"2026-04-12","t":"UI","m":"Dashboard transceiver table: VERIFIED column now shows all 4 individual criteria per row (✓/— P=Price, I=Image, D=Details, C=Competitor) in green/red — ★ 100% badge only when all 4 met; uses competitor_verified DB column"}
|
||||
{"d":"2026-04-12","t":"FIX","m":"Data quality: 59 anomalous price observations deleted (FS.COM accessories EUR 1-18 misidentified as OSFP/QSFP-DD/QSFP28; ATGBICS QSFP-DD sub-$60) — 49 transceivers competitor_verified degraded to false, 1 fully_verified badge removed"}
|
||||
{"d":"2026-04-12","t":"FIX","m":"upsertPriceObservation: hard floor $1.50 USD added before form-factor bounds check — catches accessories/cables misidentified as transceivers when form_factor defaults to SFP with loose [2,3000] bounds"}
|
||||
{"d":"2026-04-12","t":"FIX","m":"GBICS scraper: attribute order changed on site — regex updated from aria-label→href→data-event-type to dual-pass href+aria-label (both orders), data-event-type no longer required; prices now correctly extracted"}
|
||||
{"d":"2026-04-12","t":"FIX","m":"Scheduler: 11 missing boss.work() handlers added for lightweight scrapers (fluxlight, gbics, optcore, champion-one, sfpcables, blueoptics, fiber24, tscom, skylane, ascentoptics, gaotek) — jobs were queued by cron but never consumed; scrapers stale 24-48h"}
|
||||
{"d":"2026-04-12","t":"FIX","m":"withIsolatedStorage: removed rmSync cleanup of Crawlee storage dir — dir deletion caused SDK_SESSION_POOL_STATE.json not found crash on every Playwright scraper restart (ATGBICS/FS.COM failed every 2h cycle)"}
|
||||
{"d":"2026-04-12","t":"FEAT","m":"Scheduler: monitor:scraper-health job added (every 3h via pg-boss) — checks price_observations per vendor in last 6h, logs SCRAPER HEALTH ALERT to pm2 stderr for any vendor with 0 new prices"}
|
||||
{"d":"2026-04-12","t":"FIX","m":"Health check vendor names corrected: SFPCables→SFPcables, Fiber24→ShopFiber24, T&S Com→T&S Communication to match actual vendor table values"}
|
||||
{"d":"2026-04-12","t":"FIX","m":"FiberMall scraper: URL schema corrected — wrong /c/1g-sfp-transceiver/ paths (HTTP 404) replaced with actual /store-XXXXX-name.htm category URLs discovered via homepage navigation scrape"}
|
||||
{"d":"2026-04-12","t":"FIX","m":"FiberMall parser: product card split on new_proList_mainListLi (Vue.js SSR), price extracted from <span class=currency_price data-price=X.XX> — fixed false-match on data-price=0.00 from SKU variant items that appears before real price in each card"}
|
||||
{"d":"2026-04-12","t":"FIX","m":"FiberMall: also scrapes SKU brand variants from .sku_item divs within each product group (Cisco/Arista/Juniper compatible versions listed per product)"}
|
||||
{"d":"2026-04-12","t":"FIX","m":"Flexoptix price parsing: EUR text regex /([\d.]+)\s*EUR/ matched only digits before thousand separator (2,921.60 EUR → 2 EUR) — fixed to /([\d,]+\.?\d*)\s*EUR/ with comma strip; affects all Flexoptix prices >999 EUR"}
|
||||
{"d":"2026-04-12","t":"FIX","m":"Flexoptix catalog: O.138HG2.C.05 (1.6T OSFP224 2x DR4) price corrected 3009.60→2921.60 EUR (stale since 2026-04-09, Flexoptix.net shows FLEXBOX price 2921.60 via data-price-amount attribute)"}
|
||||
{"d":"2026-04-12","t":"FEAT","m":"Flexoptix catalog: 4 new search queries added — OSFP224 1.6T, OSFP224, 1.6T DR4, 1.6T transceiver — covers new 1.6T form factor previously missing entirely from catalog scraper"}
|
||||
{"d":"2026-04-12","t":"FIX","m":"Schema: competitor_verified + competitor_verified_at columns added to transceivers table (ALTER TABLE) — were referenced in db.ts upsertPriceObservation but not in schema, causing price writes to fail silently for all competitor vendors (FiberMall, QSFPTEK etc.)"}
|
||||
{"d":"2026-04-11","t":"FEAT","m":"Scraper coverage expansion: 3 new scrapers added — FiberMall (fibermall.com, USD), Vcelink (vcelink.com, USD, Shopify), OpticsBay (opticsbay.com, USD, WooCommerce) — all wired into scheduler and Pi fleet"}
|
||||
{"d":"2026-04-11","t":"FIX","m":"QSFPTEK scraper fully rewritten: site migrated from OpenCart to custom Java/Spring+Vue — old /c/*.html paths 404, now uses /mall/commodity/list API with attribute-based data rate filtering; 8 attribute IDs for 1G/10G/25G/40G/100G/200G/400G/800G"}
|
||||
{"d":"2026-04-11","t":"INFRA","m":"Scheduler: 61 workers total, 53 cron schedules — FiberMall/Vcelink/OpticsBay added at :03, :07, :57 past even hours"}
|
||||
{"d":"2026-04-09","t":"FEAT","m":"Price anomaly detection: PRICE_BOUNDS per form-factor in db.ts upsertPriceObservation — prices outside [min,max] USD range silently rejected to prevent garbage data (e.g. SFP+ [4, 5000], OSFP224 [200, 60000])"}
|
||||
{"d":"2026-04-09","t":"UI","m":"Dashboard: LLM panel redesigned for light theme readability; LLM model selector added to Blog Engine tab"}
|
||||
{"d":"2026-04-09","t":"INFRA","m":"Pi Starlink proxy-agent: scraper routes selected lightweight scrapers exclusively to Pi worker fleet via SOCKS5 — no Playwright traffic on Pi nodes"}
|
||||
{"d":"2026-04-09","t":"DATA","m":"800G standards deep enrichment: migration 033 — IEEE 802.3df, OIF 800G IA, 800G MSA, OSFP MSA, QSFP-DD800 MSA with links, status, timeline"}
|
||||
|
||||
@ -28,8 +28,6 @@ import { procurementRouter } from "./routes/procurement";
|
||||
import { changelogRouter } from "./routes/changelog";
|
||||
import { newsRouter } from "./routes/news";
|
||||
import { proxyRouter } from "./routes/proxy";
|
||||
import { reviewRouter } from "./routes/review";
|
||||
import { stockRouter } from "./routes/stock";
|
||||
|
||||
const app = express();
|
||||
|
||||
@ -86,8 +84,6 @@ app.use("/api/hot-topics", hotTopicsRouter);
|
||||
app.use("/api/procurement", procurementRouter);
|
||||
app.use("/api/changelog", changelogRouter);
|
||||
app.use("/api/news", newsRouter);
|
||||
app.use("/api/review", reviewRouter);
|
||||
app.use("/api/stock", stockRouter);
|
||||
|
||||
// Dashboard (static HTML)
|
||||
app.use("/dashboard", express.static(join(__dirname, "..", "..", "dashboard")));
|
||||
|
||||
@ -1,348 +0,0 @@
|
||||
/**
 * Manual Review API — Transceiver Equivalence Review Queue
 *
 * GET   /api/review/equivalences               — list (filter by status)
 * GET   /api/review/equivalences/stats         — pending/approved/rejected counts
 * POST  /api/review/equivalences/:id/approve   — approve + set competitor_verified
 * POST  /api/review/equivalences/:id/reject    — reject with optional reason
 * PATCH /api/review/equivalences/:id           — edit match_notes
 * POST  /api/review/equivalences/approve-all   — approve every pending equivalence
 * POST  /api/review/equivalences/bulk-approve  — approve pending equivalences at/above a confidence threshold
 * POST  /api/review/run-matcher                — trigger equivalence job immediately
 */
|
||||
import { Router, Request, Response } from "express";
|
||||
import { pool } from "../db/client";
|
||||
|
||||
/**
 * Shared promotion step: marks a transceiver as fully verified when all four
 * verification flags (price, image, details, competitor) are true and it is
 * not already flagged.
 *
 * @param transceiverId id of the transceiver row to evaluate
 * @returns true when this call flipped fully_verified to true, false when no
 *          row matched (flags incomplete, already promoted, or unknown id)
 */
async function checkAndSetFullyVerified(transceiverId: string): Promise<boolean> {
  // COALESCE keeps an existing fully_verified_at timestamp instead of overwriting it.
  const promoteSql = `UPDATE transceivers
       SET fully_verified = true,
           fully_verified_at = COALESCE(fully_verified_at, NOW())
       WHERE id = $1
         AND price_verified = true AND image_verified = true
         AND details_verified = true AND competitor_verified = true
         AND (fully_verified IS NULL OR fully_verified = false)
       RETURNING id`;

  const { rowCount } = await pool.query(promoteSql, [transceiverId]);
  const promotedRows = rowCount ?? 0;
  return promotedRows > 0;
}
|
||||
|
||||
export const reviewRouter = Router();
|
||||
|
||||
// ── GET /api/review/equivalences ──────────────────────────────────────────────
// Paginated list of equivalence candidates with both transceivers' key specs
// and the competitor's most recent price from the last 30 days.
//
// Query params:
//   status — pending (default) | approved | rejected | auto_approved | all | needs_research
//   page   — 1-based page number (default 1)
//   limit  — page size, clamped to 1..100 (default 50)
//
// Responds 400 on an unknown status and 500 on a database error.
reviewRouter.get("/equivalences", async (req: Request, res: Response) => {
  const status = (req.query.status as string) || "pending";
  const page = Math.max(1, parseInt(req.query.page as string, 10) || 1);
  // Lower bound matters: a negative LIMIT is rejected by PostgreSQL.
  const limit = Math.min(100, Math.max(1, parseInt(req.query.limit as string, 10) || 50));
  const offset = (page - 1) * limit;

  const validStatuses = ["pending", "approved", "rejected", "auto_approved", "all", "needs_research"];
  if (!validStatuses.includes(status)) {
    res.status(400).json({ success: false, error: "Invalid status filter" });
    return;
  }

  // Only values are ever bound; the WHERE text itself comes from fixed strings.
  let where = "";
  const filterParams: unknown[] = [];
  if (status === "needs_research") {
    where = `WHERE eq.status IN ('approved','auto_approved') AND eq.re_research_due_at IS NOT NULL AND eq.re_research_due_at <= NOW()`;
  } else if (status !== "all") {
    where = `WHERE eq.status = $1`;
    filterParams.push(status);
  }
  const limitIdx = filterParams.length + 1;
  const offsetIdx = filterParams.length + 2;

  try {
    const [rows, countResult] = await Promise.all([
      pool.query(`
        SELECT
          eq.id,
          eq.confidence,
          eq.match_basis,
          eq.match_notes,
          eq.status,
          eq.reviewed_by,
          eq.reviewed_at,
          eq.reject_reason,
          eq.re_research_due_at,
          eq.re_researched_at,
          eq.created_at,
          eq.updated_at,
          -- Flexoptix transceiver
          fx.id               AS fx_id,
          fx.part_number      AS fx_part_number,
          fx.standard_name    AS fx_standard_name,
          fx.form_factor      AS fx_form_factor,
          fx.speed            AS fx_speed,
          fx.speed_gbps       AS fx_speed_gbps,
          fx.fiber_type       AS fx_fiber_type,
          fx.reach_meters     AS fx_reach_meters,
          fx.reach_label      AS fx_reach_label,
          fx.wavelengths      AS fx_wavelengths,
          fx.connector        AS fx_connector,
          fx.product_page_url AS fx_url,
          fxv.name            AS fx_vendor,
          -- Competitor transceiver
          cp.id               AS cp_id,
          cp.part_number      AS cp_part_number,
          cp.standard_name    AS cp_standard_name,
          cp.form_factor      AS cp_form_factor,
          cp.speed            AS cp_speed,
          cp.speed_gbps       AS cp_speed_gbps,
          cp.fiber_type       AS cp_fiber_type,
          cp.reach_meters     AS cp_reach_meters,
          cp.reach_label      AS cp_reach_label,
          cp.wavelengths      AS cp_wavelengths,
          cp.connector        AS cp_connector,
          cp.product_page_url AS cp_url,
          cpv.name            AS cp_vendor,
          -- Latest competitor price (price and currency from the same observation)
          lp.price            AS cp_latest_price,
          lp.currency         AS cp_latest_currency
        FROM transceiver_equivalences eq
        JOIN transceivers fx ON fx.id = eq.flexoptix_id
        JOIN vendors fxv ON fxv.id = fx.vendor_id
        JOIN transceivers cp ON cp.id = eq.competitor_id
        JOIN vendors cpv ON cpv.id = cp.vendor_id
        LEFT JOIN LATERAL (
          SELECT po.price, po.currency
          FROM price_observations po
          WHERE po.transceiver_id = cp.id
            AND po.time > NOW() - INTERVAL '30 days'
          ORDER BY po.time DESC
          LIMIT 1
        ) lp ON true
        ${where}
        -- eq.id is a tie-breaker so pages do not overlap when confidence/created_at tie
        ORDER BY eq.confidence DESC, eq.created_at DESC, eq.id
        LIMIT $${limitIdx} OFFSET $${offsetIdx}
      `, [...filterParams, limit, offset]),
      pool.query(
        `SELECT COUNT(*) FROM transceiver_equivalences eq ${where}`,
        filterParams
      ),
    ]);

    res.json({
      success: true,
      data: rows.rows,
      total: parseInt(countResult.rows[0].count, 10),
      page,
      limit,
    });
  } catch (err) {
    console.error("GET /api/review/equivalences error:", err);
    res.status(500).json({ success: false, error: "Internal server error" });
  }
});
|
||||
|
||||
// ── GET /api/review/equivalences/stats ────────────────────────────────────────
// Counts of equivalences per status, plus how many approved/auto-approved rows
// have a re_research_due_at that is already in the past. Responds 500 on a
// database error.
reviewRouter.get("/equivalences/stats", async (_req: Request, res: Response) => {
  try {
    const result = await pool.query(`
      SELECT
        COUNT(*) FILTER (WHERE status = 'pending')       AS pending,
        COUNT(*) FILTER (WHERE status = 'approved')      AS approved,
        COUNT(*) FILTER (WHERE status = 'auto_approved') AS auto_approved,
        COUNT(*) FILTER (WHERE status = 'rejected')      AS rejected,
        COUNT(*) FILTER (
          WHERE status IN ('approved','auto_approved')
            AND re_research_due_at IS NOT NULL
            AND re_research_due_at <= NOW()
        )                                                AS needs_research,
        COUNT(*)                                         AS total
      FROM transceiver_equivalences
    `);

    const row = result.rows[0];
    // Counts are parsed from the driver's values; `|| 0` guards against NaN.
    const toCount = (value: string): number => parseInt(value, 10) || 0;

    res.json({
      success: true,
      stats: {
        pending: toCount(row.pending),
        approved: toCount(row.approved),
        auto_approved: toCount(row.auto_approved),
        rejected: toCount(row.rejected),
        needs_research: toCount(row.needs_research),
        total: toCount(row.total),
      },
    });
  } catch (err) {
    console.error("GET /api/review/equivalences/stats error:", err);
    res.status(500).json({ success: false, error: "Internal server error" });
  }
});
|
||||
|
||||
// ── POST /api/review/equivalences/:id/approve ─────────────────────────────────
// Approves one equivalence, sets competitor_verified on its Flexoptix
// transceiver, then promotes that transceiver to fully_verified when all four
// flags are set.
//
// Body: { reviewer?: string } — defaults to "manual".
// Responds 404 when the equivalence does not exist, 500 on a database error.
reviewRouter.post("/equivalences/:id/approve", async (req: Request, res: Response) => {
  const { id } = req.params;
  // req.body may be undefined when no body was sent/parsed.
  const reviewer = (req.body as { reviewer?: string } | undefined)?.reviewer || "manual";

  try {
    // One statement so the approval and the competitor_verified flag are written
    // atomically. Data-modifying CTEs run to completion even though the outer
    // SELECT never reads "flagged".
    const approved = await pool.query(`
      WITH approved AS (
        UPDATE transceiver_equivalences
        SET status = 'approved', reviewed_by = $2, reviewed_at = NOW()
        WHERE id = $1
        RETURNING flexoptix_id
      ),
      flagged AS (
        UPDATE transceivers t
        SET competitor_verified = true,
            competitor_verified_at = NOW()
        FROM approved a
        WHERE t.id = a.flexoptix_id
        RETURNING t.id
      )
      SELECT flexoptix_id FROM approved
    `, [id, reviewer]);

    if (!approved.rows[0]) {
      res.status(404).json({ success: false, error: "Not found" });
      return;
    }

    const { flexoptix_id } = approved.rows[0] as { flexoptix_id: string };

    // Runs as a separate statement so it sees the competitor_verified update above.
    const fullyVerifiedEarned = await checkAndSetFullyVerified(flexoptix_id);

    res.json({
      success: true,
      fully_verified_earned: fullyVerifiedEarned,
    });
  } catch (err) {
    console.error("POST /api/review/equivalences/:id/approve error:", err);
    res.status(500).json({ success: false, error: "Internal server error" });
  }
});
|
||||
|
||||
// ── POST /api/review/equivalences/:id/reject ──────────────────────────────────
// Marks one equivalence as rejected.
//
// Body: { reason?: string; reviewer?: string } — reason is stored as NULL when
// absent/empty, reviewer defaults to "manual".
// Responds 404 when the equivalence does not exist, 500 on a database error.
reviewRouter.post("/equivalences/:id/reject", async (req: Request, res: Response) => {
  const { id } = req.params;
  // Fall back to an empty object so a missing body does not throw on destructuring.
  const { reason, reviewer } = (req.body ?? {}) as { reason?: string; reviewer?: string };

  try {
    const result = await pool.query(`
      UPDATE transceiver_equivalences
      SET status = 'rejected',
          reject_reason = $2,
          reviewed_by = $3,
          reviewed_at = NOW()
      WHERE id = $1
      RETURNING id
    `, [id, reason || null, reviewer || "manual"]);

    if (!result.rowCount) {
      res.status(404).json({ success: false, error: "Not found" });
      return;
    }

    res.json({ success: true });
  } catch (err) {
    console.error("POST /api/review/equivalences/:id/reject error:", err);
    res.status(500).json({ success: false, error: "Internal server error" });
  }
});
|
||||
|
||||
// ── PATCH /api/review/equivalences/:id ────────────────────────────────────────
// Overwrites match_notes on one equivalence and bumps updated_at.
//
// Body: { match_notes: string } — required (400 when the key is missing).
// Responds 404 when the equivalence does not exist, 500 on a database error.
reviewRouter.patch("/equivalences/:id", async (req: Request, res: Response) => {
  const { id } = req.params;
  // Fall back to an empty object so a missing body yields the 400 below
  // instead of a thrown TypeError.
  const { match_notes } = (req.body ?? {}) as { match_notes?: string };

  if (match_notes === undefined) {
    res.status(400).json({ success: false, error: "match_notes required" });
    return;
  }

  try {
    const result = await pool.query(`
      UPDATE transceiver_equivalences
      SET match_notes = $2, updated_at = NOW()
      WHERE id = $1
      RETURNING id
    `, [id, match_notes]);

    if (!result.rowCount) {
      res.status(404).json({ success: false, error: "Not found" });
      return;
    }

    res.json({ success: true });
  } catch (err) {
    console.error("PATCH /api/review/equivalences/:id error:", err);
    res.status(500).json({ success: false, error: "Internal server error" });
  }
});
|
||||
|
||||
// ── POST /api/review/equivalences/approve-all ─────────────────────────────────
// Approve ALL pending equivalences regardless of confidence.
// Low-confidence ones (< 0.73) get re_research_due_at = NOW() so the nightly
// re-research job will re-verify them one by one.
//
// Body: { reviewer?: string } — defaults to "approve-all".
// Response: { success, approved, fully_verified_earned, scheduled_re_research }.
// Responds 500 on a database error.
reviewRouter.post("/equivalences/approve-all", async (req: Request, res: Response) => {
  const reviewer = (req.body as { reviewer?: string } | undefined)?.reviewer || "approve-all";
  const RE_RESEARCH_THRESHOLD = 0.73;

  try {
    // Single set-based statement instead of SELECT + two UPDATEs per row:
    //  - approval and competitor_verified are written atomically;
    //  - `WHERE status = 'pending'` is evaluated at update time, so a row that
    //    was rejected in the meantime is not flipped back to approved.
    // IS DISTINCT FROM true also covers a NULL flag, matching the single-approve
    // route, which sets competitor_verified unconditionally.
    const result = await pool.query(`
      WITH approved AS (
        UPDATE transceiver_equivalences
        SET status = 'approved',
            reviewed_by = $1,
            reviewed_at = NOW(),
            re_research_due_at = CASE WHEN confidence < $2 THEN NOW() ELSE NULL END,
            re_researched_at = NULL
        WHERE status = 'pending'
        RETURNING flexoptix_id, COALESCE(confidence < $2, false) AS needs_re_research
      ),
      flagged AS (
        UPDATE transceivers
        SET competitor_verified = true, competitor_verified_at = NOW()
        WHERE id IN (SELECT flexoptix_id FROM approved)
          AND competitor_verified IS DISTINCT FROM true
        RETURNING id
      )
      SELECT flexoptix_id, needs_re_research FROM approved
    `, [reviewer, RE_RESEARCH_THRESHOLD]);

    const approvedRows = result.rows as { flexoptix_id: string; needs_re_research: boolean }[];
    const approved = approvedRows.length;
    const scheduledReSearch = approvedRows.filter((row) => row.needs_re_research).length;

    // Several equivalences can point at the same Flexoptix transceiver; the
    // promotion only ever succeeds once per transceiver, so check each id once.
    const flexoptixIds = Array.from(new Set(approvedRows.map((row) => row.flexoptix_id)));
    let fullyVerified = 0;
    for (const flexoptixId of flexoptixIds) {
      if (await checkAndSetFullyVerified(flexoptixId)) fullyVerified++;
    }

    res.json({ success: true, approved, fully_verified_earned: fullyVerified, scheduled_re_research: scheduledReSearch });
  } catch (err) {
    console.error("POST /api/review/equivalences/approve-all error:", err);
    res.status(500).json({ success: false, error: "Internal server error" });
  }
});
|
||||
|
||||
// ── POST /api/review/equivalences/bulk-approve ────────────────────────────────
// Bulk-approve all pending equivalences with confidence >= threshold (default 0.73)
//
// Body: { threshold?: number | string; reviewer?: string }
//   threshold — clamped to 0..1; non-numeric or missing falls back to 0.73
//   reviewer  — defaults to "bulk-dashboard"
// Response: { success, approved, fully_verified_earned, threshold }.
// Responds 500 on a database error.
reviewRouter.post("/equivalences/bulk-approve", async (req: Request, res: Response) => {
  const body = (req.body ?? {}) as { threshold?: number | string; reviewer?: string };

  // `parseFloat(..) || 0.73` would turn an explicit 0 into 0.73 because 0 is
  // falsy; only fall back to the default when the value is not a number.
  const parsedThreshold = parseFloat(String(body.threshold));
  const threshold = Number.isFinite(parsedThreshold)
    ? Math.max(0, Math.min(1, parsedThreshold))
    : 0.73;
  const reviewer = body.reviewer || "bulk-dashboard";

  try {
    // Single set-based statement instead of SELECT + two UPDATEs per row:
    // approval and competitor_verified are written atomically, and the
    // pending/confidence filter is evaluated at update time.
    // IS DISTINCT FROM true also covers a NULL flag, matching the single-approve
    // route, which sets competitor_verified unconditionally.
    const result = await pool.query(`
      WITH approved AS (
        UPDATE transceiver_equivalences
        SET status = 'approved', reviewed_by = $2, reviewed_at = NOW()
        WHERE status = 'pending' AND confidence >= $1
        RETURNING flexoptix_id
      ),
      flagged AS (
        UPDATE transceivers
        SET competitor_verified = true, competitor_verified_at = NOW()
        WHERE id IN (SELECT flexoptix_id FROM approved)
          AND competitor_verified IS DISTINCT FROM true
        RETURNING id
      )
      SELECT flexoptix_id FROM approved
    `, [threshold, reviewer]);

    const approvedRows = result.rows as { flexoptix_id: string }[];
    const approved = approvedRows.length;

    // The promotion only ever succeeds once per transceiver, so check each id once.
    const flexoptixIds = Array.from(new Set(approvedRows.map((row) => row.flexoptix_id)));
    let fullyVerified = 0;
    for (const flexoptixId of flexoptixIds) {
      if (await checkAndSetFullyVerified(flexoptixId)) fullyVerified++;
    }

    res.json({ success: true, approved, fully_verified_earned: fullyVerified, threshold });
  } catch (err) {
    console.error("POST /api/review/equivalences/bulk-approve error:", err);
    res.status(500).json({ success: false, error: "Internal server error" });
  }
});
|
||||
|
||||
// ── POST /api/review/run-matcher ──────────────────────────────────────────────
// Trigger the equivalence matcher immediately (admin action).
// Responds 500 when the job row cannot be inserted.
reviewRouter.post("/run-matcher", async (_req: Request, res: Response) => {
  try {
    // Queue the job via pg-boss — import from scraper's db util won't work here,
    // so we fire directly via DB insert into pg-boss queue.
    // NOTE(review): ON CONFLICT DO NOTHING only suppresses unique-constraint
    // violations; whether repeated calls are de-duplicated depends on the
    // constraints of pgboss.job, which are not visible here — confirm.
    await pool.query(`
      INSERT INTO pgboss.job (name, data, priority)
      VALUES ('maintenance:find-equivalences', '{}', 0)
      ON CONFLICT DO NOTHING
    `);

    res.json({ success: true, message: "Equivalence matcher queued" });
  } catch (err) {
    console.error("POST /api/review/run-matcher error:", err);
    res.status(500).json({ success: false, error: "Internal server error" });
  }
});
|
||||
@ -1,332 +0,0 @@
|
||||
/**
|
||||
* Stock Observations API
|
||||
*
|
||||
* Exposes warehouse stock data scraped from fs.com (DE-Lager, Global-Lager,
|
||||
* Nachlieferung, units_sold, compatible_brands) and other vendors.
|
||||
*
|
||||
* Routes:
|
||||
* GET /api/stock — Latest obs per transceiver × vendor (paginated)
|
||||
* GET /api/stock/summary — Aggregate warehouse stats (totals, top movers)
|
||||
* GET /api/stock/:transceiverIdOrSku — Full obs history for one transceiver
|
||||
*/
|
||||
import { Router, Request, Response } from "express";
|
||||
import { pool } from "../db/client";
|
||||
|
||||
export const stockRouter = Router();
|
||||
|
||||
// ─── helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Reads a query-string parameter as a base-10 integer.
 *
 * @param req      incoming request whose `query` object is inspected
 * @param name     query parameter name
 * @param fallback value returned when the parameter is missing, empty, or
 *                 does not start with an integer
 * @returns the parsed integer, or `fallback`
 */
function intParam(req: Request, name: string, fallback: number): number {
  const raw = req.query[name];
  // Missing or empty-string parameter: nothing to parse.
  if (!raw) {
    return fallback;
  }
  const value = parseInt(String(raw), 10);
  if (Number.isFinite(value)) {
    return value;
  }
  return fallback;
}
|
||||
|
||||
// ─── GET /api/stock ──────────────────────────────────────────────────────────
/**
 * Returns the most recent stock observation per (transceiver, vendor) pair.
 * Filters are applied to that latest observation, not to the full history.
 *
 * Query params:
 *   vendor_id   — filter by source vendor UUID
 *   in_stock    — "true" | "false"
 *   min_de      — minimum DE-Lager quantity (integer)
 *   min_global  — minimum Global-Lager quantity (integer)
 *   part_number — partial, case-insensitive match on part_number
 *                 (% and _ are matched literally)
 *   limit       — default 50, clamped to 0..200
 *   offset      — default 0, negative values are treated as 0
 *
 * Responds 400 when min_de / min_global is not an integer, 500 on a database error.
 */
stockRouter.get("/", async (req: Request, res: Response) => {
  try {
    // Clamp both: PostgreSQL rejects a negative LIMIT or OFFSET.
    const limit = Math.max(0, Math.min(intParam(req, "limit", 50), 200));
    const offset = Math.max(0, intParam(req, "offset", 0));
    const vendorId = req.query.vendor_id ? String(req.query.vendor_id) : null;
    const inStock = req.query.in_stock === "true" ? true : req.query.in_stock === "false" ? false : null;
    const minDe = req.query.min_de ? parseInt(String(req.query.min_de), 10) : null;
    const minGlobal = req.query.min_global ? parseInt(String(req.query.min_global), 10) : null;
    const partNumber = req.query.part_number ? String(req.query.part_number) : null;

    // A non-numeric value parses to NaN, which would otherwise be bound as a
    // query parameter and surface as a database error.
    if ((minDe !== null && Number.isNaN(minDe)) || (minGlobal !== null && Number.isNaN(minGlobal))) {
      res.status(400).json({ success: false, error: "min_de and min_global must be integers" });
      return;
    }

    const conditions: string[] = [];
    const filterParams: unknown[] = [];
    let p = 1;

    if (vendorId) { conditions.push(`so.source_vendor_id = $${p++}`); filterParams.push(vendorId); }
    if (inStock !== null) { conditions.push(`so.in_stock = $${p++}`); filterParams.push(inStock); }
    if (minDe !== null) { conditions.push(`so.warehouse_de_qty >= $${p++}`); filterParams.push(minDe); }
    if (minGlobal !== null) { conditions.push(`so.warehouse_global_qty >= $${p++}`); filterParams.push(minGlobal); }
    if (partNumber) {
      // Backslash is the default LIKE escape character in PostgreSQL; escaping
      // \, % and _ makes the user's text match literally.
      const escapedPartNumber = partNumber.replace(/[\\%_]/g, "\\$&");
      conditions.push(`t.part_number ILIKE $${p++}`);
      filterParams.push(`%${escapedPartNumber}%`);
    }

    const whereClause = conditions.length ? `AND ${conditions.join(" AND ")}` : "";

    // Latest observation per (transceiver, vendor); shared by the page and count queries.
    const latestPerPair = `
      SELECT DISTINCT ON (transceiver_id, source_vendor_id) *
      FROM stock_observations
      ORDER BY transceiver_id, source_vendor_id, time DESC
    `;

    const sql = `
      SELECT
        so.time,
        t.id AS transceiver_id,
        t.part_number,
        t.form_factor,
        t.speed,
        v.name AS vendor_name,
        v.website AS vendor_website,
        so.in_stock,
        so.quantity_available,
        so.warehouse_de_qty,
        so.warehouse_de_delivery_date,
        so.warehouse_global_qty,
        so.warehouse_global_delivery_date,
        so.backorder_qty,
        so.backorder_estimated_date,
        so.units_sold,
        so.compatible_brands,
        so.price_net,
        so.product_url
      FROM (${latestPerPair}) so
      JOIN transceivers t ON t.id = so.transceiver_id
      JOIN vendors v ON v.id = so.source_vendor_id
      WHERE 1=1 ${whereClause}
      -- (transceiver_id, source_vendor_id) is unique here; it keeps paging stable
      -- when several observations share the same timestamp
      ORDER BY so.time DESC, so.transceiver_id, so.source_vendor_id
      LIMIT $${p++} OFFSET $${p++}
    `;

    const countSql = `
      SELECT COUNT(*)
      FROM (${latestPerPair}) so
      JOIN transceivers t ON t.id = so.transceiver_id
      JOIN vendors v ON v.id = so.source_vendor_id
      WHERE 1=1 ${whereClause}
    `;

    const [rows, countRow] = await Promise.all([
      pool.query(sql, [...filterParams, limit, offset]),
      pool.query(countSql, filterParams),
    ]);

    res.json({
      success: true,
      data: rows.rows,
      meta: {
        total: parseInt(countRow.rows[0].count, 10),
        limit,
        offset,
      },
    });
  } catch (err) {
    console.error("GET /api/stock error:", err);
    res.status(500).json({ success: false, error: "Internal server error" });
  }
});
|
||||
|
||||
// ─── GET /api/stock/summary ──────────────────────────────────────────────────
/**
 * Aggregate stats across the latest stock observation of every
 * (transceiver, vendor) pair.
 *
 * Responds with `{ success: true, data }` where `data` contains:
 *   totals           — a single row of overall counts and quantity sums
 *   top_sellers      — up to 20 rows ordered by units_sold (descending), taken
 *                      from the newest observation per pair that reported units_sold
 *   vendor_breakdown — one row per vendor, ordered by product count (descending)
 *   recently_updated — up to 10 in-stock observations from the last 24 hours
 *
 * Any query failure is logged and answered with HTTP 500.
 */
stockRouter.get("/summary", async (req: Request, res: Response) => {
  try {
    const [totals, topSellers, vendorBreakdown, recentlyUpdated] = await Promise.all([
      // Overall totals from latest observations.
      // SUM() over zero rows is NULL, so every sum is wrapped in COALESCE to
      // report 0 when the table is empty.
      pool.query(`
        WITH latest AS (
          SELECT DISTINCT ON (transceiver_id, source_vendor_id)
            transceiver_id,
            source_vendor_id,
            in_stock,
            warehouse_de_qty,
            warehouse_global_qty,
            backorder_qty
          FROM stock_observations
          ORDER BY transceiver_id, source_vendor_id, time DESC
        )
        SELECT
          COUNT(*) AS total_observations,
          COUNT(*) FILTER (WHERE in_stock = true) AS in_stock_count,
          COALESCE(SUM(warehouse_de_qty), 0) AS total_de_qty,
          COALESCE(SUM(warehouse_global_qty), 0) AS total_global_qty,
          COALESCE(SUM(backorder_qty), 0) AS total_backorder_qty,
          COUNT(*) FILTER (WHERE warehouse_de_qty > 0) AS products_with_de_stock,
          COUNT(*) FILTER (WHERE warehouse_global_qty > 0) AS products_with_global_stock,
          COUNT(*) FILTER (WHERE backorder_qty > 0) AS products_with_backorder,
          COUNT(DISTINCT transceiver_id) AS unique_transceivers,
          COUNT(DISTINCT source_vendor_id) AS unique_vendors
        FROM latest
      `),

      // Top sellers by units_sold. Rows without units_sold are dropped before
      // DISTINCT ON, so each pair contributes its newest observation that
      // actually carried a units_sold value.
      pool.query(`
        WITH latest AS (
          SELECT DISTINCT ON (transceiver_id, source_vendor_id)
            transceiver_id,
            source_vendor_id,
            units_sold,
            warehouse_de_qty,
            warehouse_global_qty,
            price_net,
            product_url
          FROM stock_observations
          WHERE units_sold IS NOT NULL
          ORDER BY transceiver_id, source_vendor_id, time DESC
        )
        SELECT
          t.part_number,
          t.form_factor,
          t.speed,
          v.name AS vendor_name,
          so.units_sold,
          so.warehouse_de_qty,
          so.warehouse_global_qty,
          so.price_net,
          so.product_url
        FROM latest so
        JOIN transceivers t ON t.id = so.transceiver_id
        JOIN vendors v ON v.id = so.source_vendor_id
        ORDER BY so.units_sold DESC
        LIMIT 20
      `),

      // Per-vendor stock breakdown
      pool.query(`
        WITH latest AS (
          SELECT DISTINCT ON (transceiver_id, source_vendor_id)
            transceiver_id,
            source_vendor_id,
            in_stock,
            warehouse_de_qty,
            warehouse_global_qty,
            backorder_qty,
            time
          FROM stock_observations
          ORDER BY transceiver_id, source_vendor_id, time DESC
        )
        SELECT
          v.name AS vendor_name,
          v.website AS vendor_website,
          COUNT(*) AS product_count,
          COUNT(*) FILTER (WHERE so.in_stock = true) AS in_stock_count,
          COALESCE(SUM(so.warehouse_de_qty), 0) AS total_de_qty,
          COALESCE(SUM(so.warehouse_global_qty), 0) AS total_global_qty,
          COALESCE(SUM(so.backorder_qty), 0) AS total_backorder,
          MAX(so.time) AS last_scraped
        FROM latest so
        JOIN vendors v ON v.id = so.source_vendor_id
        GROUP BY v.id, v.name, v.website
        ORDER BY product_count DESC
      `),

      // Most recent in-stock observations (with warehouse quantity) from the
      // last 24 hours. This reads raw observations, not the per-pair latest
      // row, so the same product can appear more than once.
      pool.query(`
        SELECT
          t.part_number,
          t.form_factor,
          t.speed,
          v.name AS vendor_name,
          so.warehouse_de_qty,
          so.warehouse_global_qty,
          so.time AS observed_at
        FROM stock_observations so
        JOIN transceivers t ON t.id = so.transceiver_id
        JOIN vendors v ON v.id = so.source_vendor_id
        WHERE so.time >= NOW() - INTERVAL '24 hours'
          AND so.in_stock = true
          AND (so.warehouse_de_qty > 0 OR so.warehouse_global_qty > 0)
        ORDER BY so.time DESC
        LIMIT 10
      `),
    ]);

    res.json({
      success: true,
      data: {
        totals: totals.rows[0],
        top_sellers: topSellers.rows,
        vendor_breakdown: vendorBreakdown.rows,
        recently_updated: recentlyUpdated.rows,
      },
    });
  } catch (err) {
    console.error("GET /api/stock/summary error:", err);
    res.status(500).json({ success: false, error: "Internal server error" });
  }
});
|
||||
|
||||
// ─── GET /api/stock/:id ──────────────────────────────────────────────────────
/**
 * Full observation history for one transceiver.
 * :id can be a UUID or a part_number (case-insensitive, matched literally).
 * Query params:
 *   vendor_id — filter by vendor UUID (HTTP 400 when it is not a UUID)
 *   days      — look-back window in days (default 30)
 *   limit     — max observations returned (default 100, capped at 500)
 *
 * Responds 404 when no transceiver matches, 500 on unexpected errors.
 */
stockRouter.get("/:id", async (req: Request, res: Response) => {
  try {
    const id = String(req.params.id);
    const days = intParam(req, "days", 30);
    const limit = Math.min(intParam(req, "limit", 100), 500);
    const vendorId = req.query.vendor_id ? String(req.query.vendor_id) : null;

    const uuidRegex = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;

    // Reject a malformed vendor filter up front; otherwise the database raises
    // a UUID syntax error that would be reported as a generic 500.
    if (vendorId && !uuidRegex.test(vendorId)) {
      res.status(400).json({ success: false, error: "Invalid vendor_id" });
      return;
    }

    // Resolve UUID vs part_number
    let transceiverUuid: string | null = null;
    if (uuidRegex.test(id)) {
      transceiverUuid = id;
    } else {
      // Escape LIKE metacharacters so "%" / "_" in the URL match literally and
      // cannot be used to select an arbitrary transceiver.
      const literalPattern = id.replace(/[\\%_]/g, "\\$&");
      const r = await pool.query(
        `SELECT id FROM transceivers WHERE part_number ILIKE $1 LIMIT 1`,
        [literalPattern]
      );
      if (r.rows.length > 0) transceiverUuid = r.rows[0].id;
    }

    if (!transceiverUuid) {
      res.status(404).json({ success: false, error: "Transceiver not found" });
      return;
    }

    // $1 = transceiver, $2 = days, $3 = limit; the optional vendor filter is
    // appended and referenced by its position.
    const params: unknown[] = [transceiverUuid, days, limit];
    let vendorFilter = "";
    if (vendorId) {
      params.push(vendorId);
      vendorFilter = `AND so.source_vendor_id = $${params.length}`;
    }

    const [transceiver, observations] = await Promise.all([
      pool.query(
        `SELECT t.*, v.name AS brand_name
         FROM transceivers t LEFT JOIN vendors v ON v.id = t.brand_vendor_id
         WHERE t.id = $1`,
        [transceiverUuid]
      ),
      pool.query(
        `SELECT
           so.time,
           v.name AS vendor_name,
           v.website AS vendor_website,
           so.in_stock,
           so.quantity_available,
           so.warehouse_de_qty,
           so.warehouse_de_delivery_date,
           so.warehouse_global_qty,
           so.warehouse_global_delivery_date,
           so.backorder_qty,
           so.backorder_estimated_date,
           so.units_sold,
           so.compatible_brands,
           so.price_net,
           so.product_url
         FROM stock_observations so
         JOIN vendors v ON v.id = so.source_vendor_id
         WHERE so.transceiver_id = $1
           AND so.time >= NOW() - ($2 || ' days')::INTERVAL
           ${vendorFilter}
         ORDER BY so.time DESC
         LIMIT $3`,
        params
      ),
    ]);

    // A syntactically valid UUID may still reference no row.
    if (!transceiver.rows[0]) {
      res.status(404).json({ success: false, error: "Transceiver not found" });
      return;
    }

    res.json({
      success: true,
      data: {
        transceiver: transceiver.rows[0],
        observations: observations.rows,
        meta: {
          count: observations.rows.length,
          days_requested: days,
        },
      },
    });
  } catch (err) {
    console.error("GET /api/stock/:id error:", err);
    res.status(500).json({ success: false, error: "Internal server error" });
  }
});
|
||||
File diff suppressed because it is too large
Load Diff
@ -49,9 +49,6 @@ const QUEUES = [
|
||||
"scrape:pricing:skylane",
|
||||
"scrape:pricing:ascentoptics",
|
||||
"scrape:pricing:gaotek",
|
||||
"scrape:pricing:fibermall",
|
||||
"scrape:pricing:vcelink",
|
||||
"scrape:pricing:opticsbay",
|
||||
// Form-factor coverage scrapers
|
||||
"scrape:pricing:comms-express",
|
||||
"scrape:pricing:router-switch",
|
||||
@ -133,9 +130,6 @@ async function main() {
|
||||
const { scrapeSkylane } = await import("./scrapers/skylane");
|
||||
const { scrapeAscentOptics } = await import("./scrapers/ascentoptics");
|
||||
const { scrapeGaoTek } = await import("./scrapers/gaotek");
|
||||
const { scrapeFiberMall } = await import("./scrapers/fibermall");
|
||||
const { scrapeVcelink } = await import("./scrapers/vcelink");
|
||||
const { scrapeOpticsBay } = await import("./scrapers/opticsbay");
|
||||
|
||||
// ── Form-factor coverage scrapers ─────────────────────────────────────
|
||||
const { scrapeCommsExpress } = await import("./scrapers/comms-express");
|
||||
@ -194,9 +188,6 @@ async function main() {
|
||||
await boss.work("scrape:pricing:skylane", async () => { log("skylane"); await withIsolatedStorage("skylane", scrapeSkylane); });
|
||||
await boss.work("scrape:pricing:ascentoptics", async () => { log("ascentoptics"); await withIsolatedStorage("ascentoptics", scrapeAscentOptics); });
|
||||
await boss.work("scrape:pricing:gaotek", async () => { log("gaotek"); await withIsolatedStorage("gaotek", scrapeGaoTek); });
|
||||
await boss.work("scrape:pricing:fibermall", async () => { log("fibermall"); await scrapeFiberMall(); });
|
||||
await boss.work("scrape:pricing:vcelink", async () => { log("vcelink"); await scrapeVcelink(); });
|
||||
await boss.work("scrape:pricing:opticsbay", async () => { log("opticsbay"); await scrapeOpticsBay(); });
|
||||
|
||||
await boss.work("scrape:pricing:comms-express", async () => { log("comms-express"); await scrapeCommsExpress(); });
|
||||
await boss.work("scrape:pricing:router-switch", async () => { log("router-switch"); await scrapeRouterSwitch(); });
|
||||
@ -238,7 +229,7 @@ async function main() {
|
||||
await boss.work("compute:reorder-signals", async () => { log("reorder"); await computeReorderSignals(); });
|
||||
await boss.work("compute:forecast", async () => { log("forecast"); await runForecastEngine(); });
|
||||
|
||||
console.log(`${QUEUES.length} queues / workers active — running 24/7\n`);
|
||||
console.log(`${QUEUES.length} workers active — running 24/7\n`);
|
||||
process.on("SIGTERM", async () => { await boss.stop(); process.exit(0); });
|
||||
process.on("SIGINT", async () => { await boss.stop(); process.exit(0); });
|
||||
}
|
||||
|
||||
@ -127,9 +127,6 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
||||
"scrape:pricing:naddod",
|
||||
"scrape:pricing:qsfptek",
|
||||
"scrape:pricing:addon",
|
||||
"scrape:pricing:fibermall",
|
||||
"scrape:pricing:vcelink",
|
||||
"scrape:pricing:opticsbay",
|
||||
// ── Prediction Signal Scrapers (new) ──────────────────────────────
|
||||
"scrape:signals:sec-edgar",
|
||||
"scrape:signals:github",
|
||||
@ -183,9 +180,6 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
||||
await boss.schedule("scrape:pricing:naddod", "48 */2 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 });
|
||||
await boss.schedule("scrape:pricing:qsfptek", "52 */2 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 });
|
||||
await boss.schedule("scrape:pricing:addon", "55 */2 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 });
|
||||
await boss.schedule("scrape:pricing:fibermall", "57 */2 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 });
|
||||
await boss.schedule("scrape:pricing:vcelink", "3 */2 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 });
|
||||
await boss.schedule("scrape:pricing:opticsbay", "7 */2 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 });
|
||||
|
||||
// ══════════════════════════════════════════════════════════════════════
|
||||
// FLEXOPTIX CATALOG — every 2h (primary price source)
|
||||
@ -279,7 +273,7 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
||||
|
||||
await boss.schedule("sync:nas", "55 7 * * *", {}, { retryLimit: 1, expireInSeconds: 1800 });
|
||||
|
||||
console.log("All schedules registered — 24/7 continuous scraping (53 jobs)");
|
||||
console.log("All schedules registered — 24/7 continuous scraping (50 jobs)");
|
||||
}
|
||||
|
||||
export async function registerWorkers(boss: PgBoss): Promise<void> {
|
||||
@ -551,23 +545,5 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
||||
await scrapeAddonNetworks();
|
||||
});
|
||||
|
||||
await boss.work("scrape:pricing:fibermall", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: FiberMall pricing`);
|
||||
const { scrapeFiberMall } = await import("./scrapers/fibermall");
|
||||
await scrapeFiberMall();
|
||||
});
|
||||
|
||||
await boss.work("scrape:pricing:vcelink", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: Vcelink pricing`);
|
||||
const { scrapeVcelink } = await import("./scrapers/vcelink");
|
||||
await scrapeVcelink();
|
||||
});
|
||||
|
||||
await boss.work("scrape:pricing:opticsbay", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: OpticsBay pricing`);
|
||||
const { scrapeOpticsBay } = await import("./scrapers/opticsbay");
|
||||
await scrapeOpticsBay();
|
||||
});
|
||||
|
||||
console.log("All workers registered (61 jobs, 24/7 continuous)");
|
||||
console.log("All workers registered (58 jobs, 24/7 continuous)");
|
||||
}
|
||||
|
||||
@ -1,265 +0,0 @@
|
||||
/**
|
||||
* FiberMall Scraper — Chinese compatible transceiver vendor
|
||||
*
|
||||
* fibermall.com — custom Vue.js/PHP shop, USD pricing.
|
||||
* Large catalog: 1G–800G, SFP/SFP+/QSFP28/QSFP-DD/OSFP.
|
||||
* Rate limited: 1 req/2sec.
|
||||
*
|
||||
* URL schema (discovered 2026-04-11):
|
||||
* Category pages: /store-XXXXX-name.htm
|
||||
* Product pages: /sale-XXXXXX-name.htm
|
||||
* Pagination: /store-XXXXX-name.htm?page=N
|
||||
* Product list: CSS class "new_proList_mainListLi"
|
||||
*/
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
|
||||
// Site root; category and product paths are appended to it.
const BASE = "https://www.fibermall.com";

// Request headers sent with every page fetch.
const HEADERS = {
  "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
  Referer: "https://www.fibermall.com/",
};

// Upper bound on listing pages fetched per category.
const MAX_PAGES = 30;

// Category listing pages, discovered via homepage navigation scrape 2026-04-11.
// Path format: /store-XXXXX-description.htm
// Every product found under a category inherits its formFactor / speed.
const CATEGORIES = [
  {
    path: "/store-17147-sfp-transceivers.htm",
    formFactor: "SFP",
    speed: "1G",
    speedGbps: 1,
  },
  {
    path: "/store-17014-10g-sfp.htm",
    formFactor: "SFP+",
    speed: "10G",
    speedGbps: 10,
  },
  {
    path: "/store-17012-25g-sfp28.htm",
    formFactor: "SFP28",
    speed: "25G",
    speedGbps: 25,
  },
  {
    path: "/store-16652-40g-qsfp.htm",
    formFactor: "QSFP+",
    speed: "40G",
    speedGbps: 40,
  },
  {
    path: "/store-16528-100g-qsfp28.htm",
    formFactor: "QSFP28",
    speed: "100G",
    speedGbps: 100,
  },
  {
    path: "/store-20654-200g-qsfp56-qsfp-dd.htm",
    formFactor: "QSFP56",
    speed: "200G",
    speedGbps: 200,
  },
  {
    path: "/store-20656-400g-qsfp-dd.htm",
    formFactor: "QSFP-DD",
    speed: "400G",
    speedGbps: 400,
  },
  {
    path: "/store-21972-800g-qsfp-dd-osfp.htm",
    formFactor: "OSFP",
    speed: "800G",
    speedGbps: 800,
  },
  {
    path: "/store-16527-dac-aoc-acc-aec-cables.htm",
    formFactor: "DAC",
    speed: "10G",
    speedGbps: 10,
  },
];

/** One product (or SKU variant) parsed from a category listing page. */
interface Product {
  partNumber: string;
  name: string;
  url: string;
  // Listing price; absent when no usable price was found.
  price?: number;
  // formFactor / speed / speedGbps are copied from the category entry.
  formFactor: string;
  speed: string;
  speedGbps: number;
  // The remaining fields are derived from the product name when detectable.
  reachLabel?: string;
  reachMeters?: number;
  fiberType?: string;
  wavelength?: string;
}
|
||||
|
||||
/** Returns a promise that resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Derives a reach label and distance in meters from free text.
 *
 * Rules are evaluated in order and the first hit wins: explicit distances
 * ("10 km", "300m"; case-insensitive) come before the optic-type codes
 * (LR4, LR, ER, ZR, SR, DR, FR; upper-case only).
 *
 * @param text product name or description
 * @returns the matched label/meters pair, or undefined when no rule matches
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const rules: ReadonlyArray<{ re: RegExp; label: string; meters: number }> = [
    { re: /\b120\s*km\b/i, label: "120km", meters: 120000 },
    { re: /\b80\s*km\b/i, label: "80km", meters: 80000 },
    { re: /\b40\s*km\b/i, label: "40km", meters: 40000 },
    { re: /\b20\s*km\b/i, label: "20km", meters: 20000 },
    { re: /\b10\s*km\b/i, label: "10km", meters: 10000 },
    { re: /\b2\s*km\b/i, label: "2km", meters: 2000 },
    { re: /\b550\s*m\b/i, label: "550m", meters: 550 },
    { re: /\b500\s*m\b/i, label: "500m", meters: 500 },
    { re: /\b300\s*m\b/i, label: "300m", meters: 300 },
    { re: /\b150\s*m\b/i, label: "150m", meters: 150 },
    { re: /\b100\s*m\b/i, label: "100m", meters: 100 },
    { re: /\bLR4\b/, label: "10km", meters: 10000 },
    { re: /\bLR\b/, label: "10km", meters: 10000 },
    { re: /\bER4?\b/, label: "40km", meters: 40000 },
    { re: /\bZR4?\b/, label: "80km", meters: 80000 },
    { re: /\bSR4?\b/, label: "300m", meters: 300 },
    { re: /\bDR4?\b/, label: "500m", meters: 500 },
    { re: /\bFR4?\b/, label: "2km", meters: 2000 },
  ];

  const hit = rules.find((rule) => rule.re.test(text));
  return hit ? { label: hit.label, meters: hit.meters } : undefined;
}
|
||||
|
||||
/**
 * Classifies the transmission medium from free text.
 *
 * Checks run in order: single-mode markers, then multi-mode markers, then
 * copper markers. The two-letter optic codes (LX/LR/ER/ZR, SX/SR) must not be
 * adjacent to another letter, but they may sit at the very start or end of
 * the text ("SFP-10G-SR", "10GBASE-LR") and may be followed by a digit ("LR4").
 *
 * @param text product name or description
 * @returns "SMF", "MMF", "Copper", or "" when nothing matches
 */
function detectFiber(text: string): string {
  // (?:^|[^a-z]) / (?:[^a-z]|$) act as letter boundaries that also accept the
  // string edges; a plain [^a-z] on both sides would demand a real character
  // and miss codes at the start or end of the name.
  const singleMode = /single.?mode|smf|(?:^|[^a-z])(?:lx|lr|er|zr)(?:[^a-z]|$)|bidi|cwdm|dwdm/i;
  const multiMode = /multi.?mode|mmf|(?:^|[^a-z])(?:sx|sr)(?:[^a-z]|$)/i;
  const copper = /copper|dac|twinax|rj.?45|base-t/i;

  if (singleMode.test(text)) return "SMF";
  if (multiMode.test(text)) return "MMF";
  if (copper.test(text)) return "Copper";
  return "";
}
|
||||
|
||||
/**
 * Extracts the first 3–4 digit number that is followed by "nm"
 * (case-insensitive, optional whitespace in between).
 *
 * @param text product name or description
 * @returns the digits as a string, or "" when no wavelength is present
 */
function detectWavelength(text: string): string {
  const found = /(\d{3,4})\s*nm/i.exec(text);
  if (found === null) return "";
  return found[1];
}
|
||||
|
||||
/**
 * Parses one FiberMall category listing page into products.
 *
 * FiberMall HTML structure (SSR, confirmed 2026-04-11):
 *   <li class="new_proList_mainListLi">
 *     <a href="/sale-XXXXXX-name.htm" title="Full Name">...</a>
 *     <span class="currency_price" data-price="12.00">12.00</span>
 *   </li>
 *
 * Each <li> is a product GROUP with SKU variants inside .sku_item divs.
 * The main product link has a `title` attribute; SKU variant links do not,
 * so their link text is used as the name. Every entry of a group shares the
 * group's price. Entries are de-duplicated by URL within the page.
 *
 * @param html raw page HTML
 * @param cat  category entry whose formFactor/speed are copied onto each product
 * @returns products in document order (main link first, then its SKU variants)
 */
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
  const products: Product[] = [];
  const seen = new Set<string>();
  const collapsed = html.replace(/\s+/g, " ");

  // Builds a Product from a name plus the category defaults and records it.
  const addProduct = (url: string, name: string, partNumber: string, price: number | undefined): void => {
    seen.add(url);
    const reach = detectReach(name);
    products.push({
      partNumber,
      name,
      url,
      price,
      formFactor: cat.formFactor,
      speed: cat.speed,
      speedGbps: cat.speedGbps,
      reachLabel: reach?.label,
      reachMeters: reach?.meters,
      fiberType: detectFiber(name),
      wavelength: detectWavelength(name),
    });
  };

  // Split in front of every product-group <li> so each part holds one group.
  const liParts = collapsed.split(/(?=<li class="new_proList_mainListLi")/);

  for (const card of liParts) {
    if (!card.includes('new_proList_mainListLi')) continue;

    // Price: from <span class="currency_price" data-price="X.XX">.
    // Must target the currency_price span first — SKU items carry
    // data-price="0.00"; the fallback accepts any non-zero data-price.
    const priceM = card.match(/class="currency_price"[^>]*data-price="([\d.]+)"/i) ||
      card.match(/data-price="([1-9][\d]*\.?\d{0,2})"/); // skip 0.00
    const rawPrice = priceM ? parseFloat(priceM[1]) : undefined;
    // Discard zero, NaN and implausibly large values.
    const price = rawPrice && rawPrice > 0 && rawPrice < 100000 ? rawPrice : undefined;

    // Main product link: first <a href="/sale-..."> with title attribute
    const mainLinkM = card.match(/href="(\/sale-\d+[^"?#]*\.htm)"[^>]*title="([^"]{8,})"/i);
    if (mainLinkM) {
      const url = BASE + mainLinkM[1];
      // Decode &amp; first, then drop any remaining numeric entities.
      const name = mainLinkM[2].trim().replace(/&amp;/g, "&").replace(/&#\d+;/g, "").replace(/\s+/g, " ");
      if (!seen.has(url) && name.length >= 5) {
        // Part number = text before "compatible" / "for <Brand>", capped at 80 chars.
        const partNumber = name.split(/\s+(?:compatible|for\s+[A-Z])/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);
        addProduct(url, name, partNumber, price);
      }
    }

    // Also collect SKU variant links from .sku_item (no title attribute, use link text)
    for (const m of card.matchAll(/class="sku_item[^"]*"[^>]*>\s*<a href="(\/sale-\d+[^"?#]*\.htm)"[^>]*>([^<]{5,})<\/a>/gi)) {
      const url = BASE + m[1];
      const name = m[2].trim().replace(/&amp;/g, "&");
      if (seen.has(url) || name.length < 4) continue;
      addProduct(url, name, name.slice(0, 80), price);
    }
  }

  return products;
}
|
||||
|
||||
/**
 * Downloads a page with the scraper's request headers and a 30-second timeout.
 *
 * @param url absolute URL to fetch
 * @returns the response body as text
 * @throws Error when the response status is not OK
 */
async function fetchPage(url: string): Promise<string> {
  const response = await fetch(url, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
|
||||
|
||||
/**
 * Scrapes every FiberMall category, stores the products and their USD prices.
 *
 * For each category: fetch page 1, then pages 2..MAX_PAGES until a page is
 * empty or fails; de-duplicate by URL; create/find a transceiver row for each
 * product and record a price observation when a price was parsed.
 * A 2-second pause separates all requests, including after a category that
 * failed or turned out empty. Category and per-product errors are logged and
 * do not abort the run.
 */
export async function scrapeFiberMall(): Promise<void> {
  console.log("=== FiberMall Scraper Starting ===\n");

  const vendorId = await ensureVendor(
    "FiberMall",
    "compatible",
    "https://www.fibermall.com",
    "https://www.fibermall.com/store-16528-100g-qsfp28.htm",
  );

  let totalProducts = 0;
  let priceUpdates = 0;

  for (const cat of CATEGORIES) {
    console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`);

    try {
      const html1 = await fetchPage(BASE + cat.path);
      const catProducts = parseProductList(html1, cat);

      if (catProducts.length === 0) {
        // No `continue` here: falling through keeps the pause below in effect.
        console.log(" No products found — skipping");
      } else {
        console.log(` Found ${catProducts.length} products on page 1`);

        // URL → product; the first occurrence wins and insertion order is kept.
        const byUrl = new Map<string, Product>();
        const remember = (items: Product[]): void => {
          for (const item of items) {
            if (!byUrl.has(item.url)) byUrl.set(item.url, item);
          }
        };
        remember(catProducts);

        for (let page = 2; page <= MAX_PAGES; page++) {
          await sleep(2000);
          try {
            // FiberMall pagination: ?page=N
            const pageUrl = `${BASE}${cat.path}?page=${page}`;
            const html = await fetchPage(pageUrl);
            const pageProds = parseProductList(html, cat);
            if (pageProds.length === 0) break;
            remember(pageProds);
            console.log(` Page ${page}: ${pageProds.length} products`);
          } catch (err) {
            console.warn(` Page ${page} failed: ${(err as Error).message.slice(0, 60)}`);
            break;
          }
        }

        const uniqueProducts = [...byUrl.values()];
        console.log(` Total unique: ${uniqueProducts.length}`);

        for (const product of uniqueProducts) {
          try {
            const txId = await findOrCreateScrapedTransceiver({
              partNumber: product.partNumber,
              vendorId,
              formFactor: product.formFactor,
              speedGbps: product.speedGbps,
              speed: product.speed,
              reachMeters: product.reachMeters,
              reachLabel: product.reachLabel,
              fiberType: product.fiberType,
              wavelengths: product.wavelength,
              category: "DataCenter",
            });

            if (product.price && product.price > 0) {
              const hash = contentHash({ price: product.price, part: product.partNumber });
              const updated = await upsertPriceObservation({
                transceiverId: txId,
                sourceVendorId: vendorId,
                price: product.price,
                currency: "USD",
                stockLevel: "in_stock",
                url: product.url,
                contentHash: hash,
              });
              if (updated) priceUpdates++;
            }
            totalProducts++;
          } catch (err) {
            console.warn(` DB error: ${(err as Error).message.slice(0, 80)}`);
          }
        }
      }
    } catch (err) {
      console.error(` Category failed: ${(err as Error).message}`);
    }

    // Pause before the next category's first request.
    await sleep(2000);
  }

  console.log(`\n=== FiberMall Complete: ${totalProducts} products, ${priceUpdates} price updates ===`);
}
|
||||
|
||||
// Direct execution: run the scraper once, then close the DB pool.
// On failure the error is logged and the process exits with status 1.
if (require.main === module) {
  void (async () => {
    try {
      await scrapeFiberMall();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}
|
||||
@ -72,10 +72,6 @@ const SEARCH_QUERIES = [
|
||||
{ query: "OSFP LR4", formFactor: "OSFP", speed: "400G", speedGbps: 400 },
|
||||
{ query: "OSFP ZR", formFactor: "OSFP", speed: "400G", speedGbps: 400 },
|
||||
{ query: "OSFP 800G", formFactor: "OSFP", speed: "800G", speedGbps: 800 },
|
||||
{ query: "OSFP224 1.6T", formFactor: "OSFP224", speed: "1.6T", speedGbps: 1600 },
|
||||
{ query: "OSFP224", formFactor: "OSFP224", speed: "1.6T", speedGbps: 1600 },
|
||||
{ query: "1.6T DR4", formFactor: "OSFP224", speed: "1.6T", speedGbps: 1600 },
|
||||
{ query: "1.6T transceiver", formFactor: "OSFP224", speed: "1.6T", speedGbps: 1600 },
|
||||
// Additional granular queries for maximum coverage
|
||||
{ query: "SFP+ copper", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
|
||||
{ query: "SFP+ 10GBASE-T", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
|
||||
@ -270,13 +266,11 @@ async function searchProducts(query: string): Promise<SearchResult[]> {
|
||||
// Try data-price-amount attribute first (Magento Hyva theme)
|
||||
const attrMatch = s.match(/data-price-amount="([\d.]+)"/);
|
||||
if (attrMatch) return attrMatch[1];
|
||||
// Try plain price text like "2,921.60 EUR" or "39.64 EUR"
|
||||
// IMPORTANT: must include comma in char class to handle thousand separators
|
||||
const textMatch = s.match(/([\d,]+\.?\d*)\s*EUR/i);
|
||||
if (textMatch) return textMatch[1].replace(/,/g, "");
|
||||
// Try bare number (strip thousand-separator commas first)
|
||||
const cleaned = s.replace(/,/g, "");
|
||||
const num = parseFloat(cleaned);
|
||||
// Try plain price text like "39.64 EUR"
|
||||
const textMatch = s.match(/([\d.]+)\s*EUR/i);
|
||||
if (textMatch) return textMatch[1];
|
||||
// Try bare number
|
||||
const num = parseFloat(s);
|
||||
if (!isNaN(num) && num > 0) return String(num);
|
||||
return undefined;
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,69 +1,56 @@
|
||||
/**
|
||||
* NADDOD Scraper v2 — Chinese compatible transceiver vendor
|
||||
* NADDOD Scraper — Chinese compatible transceiver vendor
|
||||
*
|
||||
* naddod.com — Migrated from WooCommerce → Astro/Shopify-style in 2025.
|
||||
* Product URLs: /products/XXXXX.html (numeric IDs, not category slugs)
|
||||
* Sitemap: /sitemaps/products.xml
|
||||
* naddod.com — WooCommerce store, server-rendered HTML, USD pricing.
|
||||
* Products listed under product category pages.
|
||||
* Pagination via /page/N/. Rate limited: 1 req/2sec.
|
||||
*
|
||||
* Phase 1: Parse sitemap to collect all product URLs (plain HTTP)
|
||||
* Phase 2: Fetch product detail pages — extract name, price, stock count
|
||||
* Stock format: "In Stock: 543" (exact) | "In Stock: 2.1k+" (rounded) | "In Stock: Available" (boolean)
|
||||
* Per-warehouse JSON (warehouse_stock: {us, nl, sg, cn}) is in a JS hydration
|
||||
* payload that requires JS execution — only the display count is in plain HTML.
|
||||
* → stock_confidence=2 (aggregated global count) for exact/rounded counts
|
||||
* → stock_confidence=1 (boolean) for "Available" only
|
||||
*
|
||||
* Rate limited: 1 req/2sec.
|
||||
* NADDOD (Shenzhen NADDOD Information Co.) makes and sells compatible
|
||||
* optics for Cisco, Juniper, Arista, etc. Transparent USD pricing.
|
||||
*/
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, upsertStockObservation } from "../utils/db";
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
|
||||
const BASE = "https://www.naddod.com";
|
||||
const SITEMAP_URL = `${BASE}/sitemaps/products.xml`;
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
};
|
||||
|
||||
// Limit detail-page fetches per run to stay reasonable
|
||||
const MAX_DETAIL_PAGES = 600;
|
||||
const MAX_PAGES = 30;
|
||||
|
||||
const CATEGORIES = [
|
||||
{ path: "/product-category/1g-sfp-transceivers/", formFactor: "SFP", speed: "1G", speedGbps: 1 },
|
||||
{ path: "/product-category/10g-sfp-transceivers/", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
|
||||
{ path: "/product-category/25g-sfp28-transceivers/", formFactor: "SFP28", speed: "25G", speedGbps: 25 },
|
||||
{ path: "/product-category/40g-qsfp-transceivers/", formFactor: "QSFP+", speed: "40G", speedGbps: 40 },
|
||||
{ path: "/product-category/100g-qsfp28-transceivers/", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
|
||||
{ path: "/product-category/200g-qsfp56-transceivers/", formFactor: "QSFP56", speed: "200G", speedGbps: 200 },
|
||||
{ path: "/product-category/400g-qsfp-dd-transceivers/", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
|
||||
{ path: "/product-category/800g-osfp-transceivers/", formFactor: "OSFP", speed: "800G", speedGbps: 800 },
|
||||
{ path: "/product-category/transceivers/", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
|
||||
];
|
||||
|
||||
interface Product {
|
||||
partNumber: string;
|
||||
name: string;
|
||||
url: string;
|
||||
price?: number;
|
||||
formFactor: string;
|
||||
speed: string;
|
||||
speedGbps: number;
|
||||
reachLabel?: string;
|
||||
reachMeters?: number;
|
||||
fiberType?: string;
|
||||
wavelength?: string;
|
||||
compatibleWith?: string;
|
||||
}
|
||||
|
||||
function sleep(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
// ── Classification helpers ──────────────────────────────────────────────────
|
||||
|
||||
function detectFormFactor(text: string): string {
|
||||
const t = text.toLowerCase();
|
||||
if (/\bosfp\b/.test(t)) return "OSFP";
|
||||
if (/\bqsfp.?dd800\b|\bqsfp.?dd\s+800\b/.test(t)) return "QSFP-DD800";
|
||||
if (/\bqsfp.?dd\b/.test(t)) return "QSFP-DD";
|
||||
if (/\bqsfp56\b/.test(t)) return "QSFP56";
|
||||
if (/\bqsfp112\b/.test(t)) return "QSFP112";
|
||||
if (/\bqsfp28\b/.test(t)) return "QSFP28";
|
||||
if (/\bqsfp\+|\bqsfp\s*plus\b/.test(t)) return "QSFP+";
|
||||
if (/\bsfp28\b/.test(t)) return "SFP28";
|
||||
if (/\bsfp.?\+|10g.*sfp|sfp.*10g/.test(t)) return "SFP+";
|
||||
if (/\bsfp\b/.test(t)) return "SFP";
|
||||
if (/\bxfp\b/.test(t)) return "XFP";
|
||||
return "SFP+"; // default
|
||||
}
|
||||
|
||||
function detectSpeedGbps(text: string): { speed: string; speedGbps: number } {
|
||||
const t = text.toUpperCase();
|
||||
if (/\b800G\b|\b800GBE\b/.test(t)) return { speed: "800G", speedGbps: 800 };
|
||||
if (/\b400G\b|\b400GBE\b/.test(t)) return { speed: "400G", speedGbps: 400 };
|
||||
if (/\b200G\b|\b200GBE\b/.test(t)) return { speed: "200G", speedGbps: 200 };
|
||||
if (/\b100G\b|\b100GBE\b/.test(t)) return { speed: "100G", speedGbps: 100 };
|
||||
if (/\b40G\b|\b40GBE\b/.test(t)) return { speed: "40G", speedGbps: 40 };
|
||||
if (/\b25G\b|\b25GBE\b/.test(t)) return { speed: "25G", speedGbps: 25 };
|
||||
if (/\b10G\b|\b10GBE\b/.test(t)) return { speed: "10G", speedGbps: 10 };
|
||||
if (/\b1G\b|\b1GBE\b|\bGIGABIT\b/.test(t)) return { speed: "1G", speedGbps: 1 };
|
||||
return { speed: "Unknown", speedGbps: 0 };
|
||||
}
|
||||
|
||||
function detectReach(text: string): { label: string; meters: number } | undefined {
|
||||
const patterns: [RegExp, string, number][] = [
|
||||
[/\b120\s*km\b/i, "120km", 120000],
|
||||
@ -78,13 +65,16 @@ function detectReach(text: string): { label: string; meters: number } | undefine
|
||||
[/\b300\s*m\b/i, "300m", 300],
|
||||
[/\b150\s*m\b/i, "150m", 150],
|
||||
[/\b100\s*m\b/i, "100m", 100],
|
||||
[/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000],
|
||||
[/\bER4?\b/, "40km", 40000], [/\bZR4?\b/, "80km", 80000],
|
||||
[/\bSR4?\b/, "300m", 300], [/\bDR4?\b/, "500m", 500],
|
||||
[/\bLR4\b/, "10km", 10000],
|
||||
[/\bLR\b/, "10km", 10000],
|
||||
[/\bER4?\b/, "40km", 40000],
|
||||
[/\bZR4?\b/, "80km", 80000],
|
||||
[/\bSR4?\b/, "300m", 300],
|
||||
[/\bDR4?\b/, "500m", 500],
|
||||
[/\bFR4?\b/, "2km", 2000],
|
||||
];
|
||||
for (const [re, label, meters] of patterns) {
|
||||
if (re.test(text)) return { label, meters };
|
||||
for (const [regex, label, meters] of patterns) {
|
||||
if (regex.test(text)) return { label, meters };
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
@ -97,230 +87,195 @@ function detectFiber(text: string): string {
|
||||
}
|
||||
|
||||
function detectWavelength(text: string): string {
|
||||
const m = text.match(/(\d{3,4})\s*nm/i);
|
||||
return m ? m[1] : "";
|
||||
const match = text.match(/(\d{3,4})\s*nm/i);
|
||||
return match ? match[1] : "";
|
||||
}
|
||||
|
||||
function isTransceiver(name: string): boolean {
|
||||
const lower = name.toLowerCase();
|
||||
// Include: SFP, QSFP, OSFP, XFP, DAC/AOC cables count as transceivers for stock tracking
|
||||
return /sfp|qsfp|osfp|xfp|transceiver|dac|aoc|cwdm|dwdm/i.test(lower);
|
||||
function extractCompatibleVendor(name: string): string {
|
||||
const brands = ["Cisco", "Juniper", "Arista", "HPE", "Dell", "Brocade", "Extreme", "Huawei",
|
||||
"Nokia", "MikroTik", "Mellanox", "Nvidia", "Ubiquiti"];
|
||||
for (const brand of brands) {
|
||||
if (new RegExp(`\\b${brand}\\b`, "i").test(name)) return brand;
|
||||
}
|
||||
const match = name.match(/(?:for\s+|compatible\s+(?:with\s+)?)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)/);
|
||||
return match ? match[1] : "";
|
||||
}
|
||||
|
||||
// ── Stock parsing ───────────────────────────────────────────────────────────
|
||||
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
|
||||
const products: Product[] = [];
|
||||
const seen = new Set<string>();
|
||||
const collapsed = html.replace(/\s+/g, " ");
|
||||
|
||||
/**
|
||||
* Parse NADDOD stock display text.
|
||||
* "In Stock: 543" → { qty: 543, confidence: 2 }
|
||||
* "In Stock: 2.1k+" → { qty: 2100, confidence: 2 } (approximate, rounded)
|
||||
* "In Stock: Available" → { qty: undefined, confidence: 1 }
|
||||
* Returns null if no stock text found.
|
||||
*/
|
||||
function parseStockText(html: string): { qty?: number; confidence: 1 | 2 } | null {
|
||||
// Look for "In Stock: X" pattern in page text
|
||||
const m = html.match(/In\s+Stock[:\s]+([^\s<"]+)/i);
|
||||
if (!m) return null;
|
||||
// Strategy 1: WooCommerce standard product loop
|
||||
const cardRegex = /<li[^>]+class="[^"]*product[^"]*"[^>]*>([\s\S]*?)<\/li>/gi;
|
||||
let cardMatch;
|
||||
while ((cardMatch = cardRegex.exec(collapsed)) !== null) {
|
||||
const card = cardMatch[1];
|
||||
|
||||
const raw = m[1].trim().toLowerCase();
|
||||
const urlMatch = card.match(/href="(https?:\/\/(?:www\.)?naddod\.com\/product\/[^"]+)"/i);
|
||||
if (!urlMatch) continue;
|
||||
const url = urlMatch[1];
|
||||
if (seen.has(url)) continue;
|
||||
seen.add(url);
|
||||
|
||||
// "Available" = boolean only
|
||||
if (/^avail/i.test(raw)) return { confidence: 1 };
|
||||
const nameMatch = card.match(/woocommerce-loop-product__title[^>]*>([^<]+)</i) ||
|
||||
card.match(/<h2[^>]*>([^<]{10,})<\/h2>/i) ||
|
||||
card.match(/<h3[^>]*>([^<]{10,})<\/h3>/i);
|
||||
if (!nameMatch) continue;
|
||||
const name = nameMatch[1].trim().replace(/&/g, "&").replace(/–/g, "–");
|
||||
if (name.length < 5) continue;
|
||||
|
||||
// Numeric: "543" or "2.1k+" or "1.5k+"
|
||||
const kMatch = raw.match(/^([\d.]+)k\+?$/);
|
||||
if (kMatch) {
|
||||
const qty = Math.round(parseFloat(kMatch[1]) * 1000);
|
||||
return { qty: isNaN(qty) ? undefined : qty, confidence: 2 };
|
||||
const priceMatch = card.match(/\$\s*([\d,]+\.?\d*)/);
|
||||
const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined;
|
||||
|
||||
const reach = detectReach(name);
|
||||
const partNumber = name.split(/\s+(?:compatible|for|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);
|
||||
|
||||
products.push({
|
||||
partNumber, name, url,
|
||||
price: price && price > 0 && price < 100000 ? price : undefined,
|
||||
formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps,
|
||||
reachLabel: reach?.label, reachMeters: reach?.meters,
|
||||
fiberType: detectFiber(name), wavelength: detectWavelength(name),
|
||||
compatibleWith: extractCompatibleVendor(name),
|
||||
});
|
||||
}
|
||||
|
||||
const exact = parseInt(raw.replace(/[^0-9]/g, ""), 10);
|
||||
if (!isNaN(exact) && exact >= 0) return { qty: exact, confidence: 2 };
|
||||
// Strategy 2: Generic product link fallback
|
||||
if (products.length === 0) {
|
||||
const linkRegex = /href="(https?:\/\/(?:www\.)?naddod\.com\/(?:product|shop)\/[^"?#]+)"[^>]*>\s*([^<]{10,})/gi;
|
||||
let m;
|
||||
while ((m = linkRegex.exec(collapsed)) !== null) {
|
||||
const url = m[1];
|
||||
const name = m[2].trim().replace(/&/g, "&");
|
||||
if (seen.has(url) || name.length < 10) continue;
|
||||
if (!/transceiver|sfp|qsfp|osfp|dac|aoc|xfp/i.test(name)) continue;
|
||||
seen.add(url);
|
||||
|
||||
return { confidence: 1 }; // fallback: boolean
|
||||
const ctx = collapsed.slice(Math.max(0, m.index - 200), m.index + 500);
|
||||
const priceM = ctx.match(/\$\s*([\d,]+\.?\d*)/);
|
||||
const price = priceM ? parseFloat(priceM[1].replace(/,/g, "")) : undefined;
|
||||
const reach = detectReach(name);
|
||||
|
||||
products.push({
|
||||
partNumber: name.split(/\s+/)[0]?.slice(0, 80) || "",
|
||||
name, url,
|
||||
price: price && price > 0 && price < 100000 ? price : undefined,
|
||||
formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps,
|
||||
reachLabel: reach?.label, reachMeters: reach?.meters,
|
||||
fiberType: detectFiber(name), wavelength: detectWavelength(name),
|
||||
compatibleWith: extractCompatibleVendor(name),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// ── HTTP helpers ────────────────────────────────────────────────────────────
|
||||
return products;
|
||||
}
|
||||
|
||||
async function fetchText(url: string): Promise<string> {
|
||||
async function fetchPage(url: string): Promise<string> {
|
||||
const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(30000) });
|
||||
if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
|
||||
if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);
|
||||
return resp.text();
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse product detail page HTML → extract name, price, stock.
|
||||
*/
|
||||
function parseDetailPage(html: string, url: string): {
|
||||
name: string;
|
||||
price?: number;
|
||||
stock: { qty?: number; confidence: 1 | 2 } | null;
|
||||
} | null {
|
||||
// Product name: og:title or <title> or <h1>
|
||||
const ogTitle = html.match(/<meta\s+property="og:title"\s+content="([^"]+)"/i)?.[1];
|
||||
const h1 = html.match(/<h1[^>]*>([^<]{15,})<\/h1>/i)?.[1]?.trim();
|
||||
const titleTag = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim();
|
||||
const name = (ogTitle || h1 || titleTag || "").replace(/\s+/g, " ").slice(0, 200);
|
||||
|
||||
if (!name || name.length < 10) return null;
|
||||
if (!isTransceiver(name)) return null;
|
||||
|
||||
// Price: "US$ 10.90" or "$10.90"
|
||||
const priceMatch = html.match(/US\$\s*([\d,]+\.?\d{0,2})/i) ||
|
||||
html.match(/\$\s*([\d,]+\.\d{2})\b/);
|
||||
const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined;
|
||||
|
||||
// Stock count
|
||||
const stock = parseStockText(html);
|
||||
|
||||
return { name, price: price && price > 0 && price < 500000 ? price : undefined, stock };
|
||||
}
|
||||
|
||||
// ── Sitemap parsing ─────────────────────────────────────────────────────────
|
||||
|
||||
async function fetchProductUrlsFromSitemap(): Promise<string[]> {
|
||||
console.log(` Fetching sitemap: ${SITEMAP_URL}`);
|
||||
const xml = await fetchText(SITEMAP_URL);
|
||||
|
||||
// Extract all <loc> URLs that match /products/XXXXX.html
|
||||
const urls: string[] = [];
|
||||
const locRegex = /<loc>([^<]+\/products\/\d+\.html)<\/loc>/gi;
|
||||
let m: RegExpExecArray | null;
|
||||
while ((m = locRegex.exec(xml)) !== null) {
|
||||
const url = m[1].trim();
|
||||
// Keep only canonical English URLs (no language prefix)
|
||||
if (!url.includes("/en/") && !url.includes("/de/") && !url.includes("/fr/")) {
|
||||
urls.push(url);
|
||||
}
|
||||
}
|
||||
|
||||
return [...new Set(urls)]; // deduplicate
|
||||
}
|
||||
|
||||
// ── Main scraper ────────────────────────────────────────────────────────────
|
||||
|
||||
export async function scrapeNaddod(): Promise<void> {
|
||||
console.log("=== NADDOD Scraper v2 Starting (sitemap + detail mode) ===\n");
|
||||
console.log("=== NADDOD Scraper Starting ===\n");
|
||||
|
||||
const vendorId = await ensureVendor(
|
||||
"NADDOD",
|
||||
"compatible",
|
||||
"https://www.naddod.com",
|
||||
"https://www.naddod.com/collections/transceivers",
|
||||
"https://www.naddod.com/product-category/transceivers/",
|
||||
);
|
||||
|
||||
// ── Phase 1: Discover product URLs via sitemap ────────────────────────────
|
||||
console.log("[Phase 1] Discovering products from sitemap...");
|
||||
let productUrls: string[] = [];
|
||||
try {
|
||||
productUrls = await fetchProductUrlsFromSitemap();
|
||||
console.log(` Found ${productUrls.length} product URLs in sitemap`);
|
||||
} catch (err) {
|
||||
console.error(` Sitemap fetch failed: ${(err as Error).message}`);
|
||||
return;
|
||||
}
|
||||
|
||||
if (productUrls.length === 0) {
|
||||
console.warn(" No product URLs found — aborting");
|
||||
return;
|
||||
}
|
||||
|
||||
// Limit to avoid excessive runtime
|
||||
const urls = productUrls.slice(0, MAX_DETAIL_PAGES);
|
||||
console.log(` Processing ${urls.length} products (limit: ${MAX_DETAIL_PAGES})`);
|
||||
|
||||
// ── Phase 2: Fetch detail pages + write to DB ─────────────────────────────
|
||||
console.log("\n[Phase 2] Fetching product detail pages...");
|
||||
|
||||
let processed = 0;
|
||||
let totalProducts = 0;
|
||||
let priceUpdates = 0;
|
||||
let stockWritten = 0;
|
||||
let stockSkipped = 0;
|
||||
let skippedNonTx = 0;
|
||||
let errors = 0;
|
||||
const seenCategories = new Set<string>();
|
||||
|
||||
for (const cat of CATEGORIES) {
|
||||
console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`);
|
||||
|
||||
for (const url of urls) {
|
||||
await sleep(2000);
|
||||
try {
|
||||
const html = await fetchText(url);
|
||||
const detail = parseDetailPage(html, url);
|
||||
const html1 = await fetchPage(BASE + cat.path);
|
||||
const catProducts = parseProductList(html1, cat);
|
||||
|
||||
if (!detail) {
|
||||
skippedNonTx++;
|
||||
if (cat.path.includes("/transceivers/") && seenCategories.size > 3) {
|
||||
console.log(` Skipping generic fallback (${seenCategories.size} specific categories scraped)`);
|
||||
continue;
|
||||
}
|
||||
|
||||
const { name, price, stock } = detail;
|
||||
const { speed, speedGbps } = detectSpeedGbps(name);
|
||||
const formFactor = detectFormFactor(name);
|
||||
const reach = detectReach(name);
|
||||
const fiberType = detectFiber(name);
|
||||
const wavelength = detectWavelength(name);
|
||||
if (catProducts.length === 0) {
|
||||
console.log(" No products on page 1 — skipping");
|
||||
continue;
|
||||
}
|
||||
|
||||
// Extract part number from name (first word-group before "Compatible" or vendor name)
|
||||
const partNumber = name.split(/\s+(?:compatible|for\s+[A-Z]|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);
|
||||
seenCategories.add(cat.path);
|
||||
console.log(` Found ${catProducts.length} products on page 1`);
|
||||
|
||||
const totalPagesMatch = html1.match(/page-numbers[^>]*>(\d+)<\/a>(?!.*page-numbers)/);
|
||||
const totalPages = totalPagesMatch ? Math.min(parseInt(totalPagesMatch[1]), MAX_PAGES) : 1;
|
||||
console.log(` Total pages: ${totalPages}`);
|
||||
|
||||
const allProducts = [...catProducts];
|
||||
|
||||
for (let page = 2; page <= totalPages; page++) {
|
||||
await sleep(2000);
|
||||
try {
|
||||
const html = await fetchPage(BASE + cat.path + `page/${page}/`);
|
||||
const pageProds = parseProductList(html, cat);
|
||||
if (pageProds.length === 0) break;
|
||||
allProducts.push(...pageProds);
|
||||
console.log(` Page ${page}: ${pageProds.length} products`);
|
||||
} catch (err) {
|
||||
console.warn(` Page ${page} failed: ${(err as Error).message.slice(0, 60)}`);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const uniqueProducts = allProducts.filter((p, i, arr) => arr.findIndex((x) => x.url === p.url) === i);
|
||||
console.log(` Total unique: ${uniqueProducts.length}`);
|
||||
|
||||
for (const product of uniqueProducts) {
|
||||
try {
|
||||
const txId = await findOrCreateScrapedTransceiver({
|
||||
partNumber,
|
||||
partNumber: product.partNumber,
|
||||
vendorId,
|
||||
formFactor,
|
||||
speedGbps,
|
||||
speed,
|
||||
reachMeters: reach?.meters,
|
||||
reachLabel: reach?.label,
|
||||
fiberType,
|
||||
wavelengths: wavelength,
|
||||
formFactor: product.formFactor,
|
||||
speedGbps: product.speedGbps,
|
||||
speed: product.speed,
|
||||
reachMeters: product.reachMeters,
|
||||
reachLabel: product.reachLabel,
|
||||
fiberType: product.fiberType,
|
||||
wavelengths: product.wavelength,
|
||||
category: "DataCenter",
|
||||
});
|
||||
|
||||
// Price observation
|
||||
if (price && price > 0) {
|
||||
const hash = contentHash({ price, part: partNumber });
|
||||
const isNew = await upsertPriceObservation({
|
||||
if (product.price && product.price > 0) {
|
||||
const hash = contentHash({ price: product.price, part: product.partNumber });
|
||||
const updated = await upsertPriceObservation({
|
||||
transceiverId: txId,
|
||||
sourceVendorId: vendorId,
|
||||
price,
|
||||
price: product.price,
|
||||
currency: "USD",
|
||||
stockLevel: stock?.qty !== undefined && stock.qty > 0 ? "in_stock" :
|
||||
stock?.confidence === 1 ? "in_stock" : "unknown",
|
||||
url,
|
||||
stockLevel: "in_stock",
|
||||
url: product.url,
|
||||
contentHash: hash,
|
||||
});
|
||||
if (isNew) priceUpdates++;
|
||||
if (updated) priceUpdates++;
|
||||
}
|
||||
|
||||
// Stock observation
|
||||
if (stock !== null) {
|
||||
const stockLevel = stock.qty !== undefined ? (stock.qty > 0 ? "in_stock" : "out_of_stock") : "in_stock";
|
||||
const isNew = await upsertStockObservation({
|
||||
transceiverId: txId,
|
||||
sourceVendorId: vendorId,
|
||||
stockLevel,
|
||||
quantityAvailable: stock.qty !== undefined && stock.qty > 0 ? stock.qty : undefined,
|
||||
productUrl: url,
|
||||
stockConfidence: stock.confidence,
|
||||
priceCurrency: "USD",
|
||||
priceIncludesTax: false,
|
||||
});
|
||||
if (isNew) stockWritten++;
|
||||
else stockSkipped++;
|
||||
totalProducts++;
|
||||
} catch (err) {
|
||||
console.warn(` DB error: ${(err as Error).message.slice(0, 80)}`);
|
||||
}
|
||||
|
||||
processed++;
|
||||
if (processed % 50 === 0) {
|
||||
console.log(` Progress: ${processed}/${urls.length} | prices: ${priceUpdates} | stock: ${stockWritten} new / ${stockSkipped} unchanged`);
|
||||
}
|
||||
} catch (err) {
|
||||
errors++;
|
||||
if (errors <= 10) console.warn(` Error for ${url}: ${(err as Error).message.slice(0, 70)}`);
|
||||
}
|
||||
console.error(` Category failed: ${(err as Error).message}`);
|
||||
}
|
||||
|
||||
console.log("\n=== NADDOD Scraper v2 Complete ===");
|
||||
console.log(` Products processed: ${processed}`);
|
||||
console.log(` Non-transceivers skip: ${skippedNonTx}`);
|
||||
console.log(` Price observations: ${priceUpdates} new`);
|
||||
console.log(` Stock observations: ${stockWritten} new, ${stockSkipped} unchanged`);
|
||||
if (errors > 0) console.warn(` Errors: ${errors}`);
|
||||
await sleep(2000);
|
||||
}
|
||||
|
||||
console.log(`\n=== NADDOD Complete: ${totalProducts} products, ${priceUpdates} price updates ===`);
|
||||
}
|
||||
|
||||
if (require.main === module) {
|
||||
|
||||
@ -1,257 +0,0 @@
|
||||
/**
|
||||
* OpticsBay Scraper — North American compatible transceiver vendor
|
||||
*
|
||||
* opticsbay.com — WooCommerce store, USD/CAD pricing.
|
||||
* Covers SFP/SFP+/QSFP28/QSFP-DD with competitive pricing.
|
||||
* Rate limited: 1 req/2sec.
|
||||
*
|
||||
* OpticsBay offers compatible optics for Cisco, Juniper, Arista, HPE.
|
||||
* Transparent pricing, no login required.
|
||||
*/
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
|
||||
/** Site root; every category path below is appended to this. */
const BASE = "https://www.opticsbay.com";

/** Request headers sent with every page fetch (desktop Chrome profile). */
const HEADERS = {
  "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
};

/** Highest listing page number requested per category. */
const MAX_PAGES = 20;

/**
 * Category listing pages to crawl, in crawl order.
 *
 * Each entry carries the form factor / speed that is stamped onto every
 * product found under that path. The final "/transceivers/" entry is the
 * generic listing; scrapeOpticsBay skips it once more than three other
 * categories have produced products.
 */
const CATEGORIES = [
  {
    path: "/product-category/sfp-transceiver/",
    formFactor: "SFP",
    speed: "1G",
    speedGbps: 1,
  },
  {
    path: "/product-category/sfp-plus-transceiver/",
    formFactor: "SFP+",
    speed: "10G",
    speedGbps: 10,
  },
  {
    path: "/product-category/sfp28-transceiver/",
    formFactor: "SFP28",
    speed: "25G",
    speedGbps: 25,
  },
  {
    path: "/product-category/qsfp-plus-transceiver/",
    formFactor: "QSFP+",
    speed: "40G",
    speedGbps: 40,
  },
  {
    path: "/product-category/qsfp28-transceiver/",
    formFactor: "QSFP28",
    speed: "100G",
    speedGbps: 100,
  },
  {
    path: "/product-category/qsfp-dd-transceiver/",
    formFactor: "QSFP-DD",
    speed: "400G",
    speedGbps: 400,
  },
  {
    path: "/product-category/dac-cable/",
    formFactor: "DAC",
    speed: "10G",
    speedGbps: 10,
  },
  {
    path: "/product-category/transceivers/",
    formFactor: "SFP+",
    speed: "10G",
    speedGbps: 10,
  },
];
|
||||
|
||||
/** One product row as extracted from a category listing page. */
interface Product {
  /** Identifier derived from the product name by parseProductList. */
  partNumber: string;
  /** Display name taken from the listing markup. */
  name: string;
  /** Absolute product URL; also used as the de-duplication key. */
  url: string;
  /** Parsed `$` amount; absent when no plausible price was found. */
  price?: number;

  // Copied from the category the product was found under.
  formFactor: string;
  speed: string;
  speedGbps: number;

  // Derived from the product name; absent/empty when nothing matched.
  reachLabel?: string;
  reachMeters?: number;
  fiberType?: string;
  wavelength?: string;
}
|
||||
|
||||
/** Return a promise that resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Infer the optical reach of a product from free text (usually its name).
 *
 * The table is scanned top to bottom and the first matching row wins, so
 * explicit distances ("10km", "300m") take precedence over optic-class
 * abbreviations (LR, SR, ...). Distance patterns are case-insensitive;
 * abbreviation patterns are case-sensitive and must be upper-case.
 *
 * @param text Product name or description.
 * @returns `{ label, meters }` for the first matching row, or `undefined`
 *          when nothing in the table matches.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const patterns: [RegExp, string, number][] = [
    // Explicit distances, longest first.
    [/\b120\s*km\b/i, "120km", 120000],
    [/\b80\s*km\b/i, "80km", 80000],
    [/\b40\s*km\b/i, "40km", 40000],
    [/\b20\s*km\b/i, "20km", 20000],
    [/\b10\s*km\b/i, "10km", 10000],
    [/\b2\s*km\b/i, "2km", 2000],
    [/\b550\s*m\b/i, "550m", 550],
    // 500m and 150m were missing here although the sibling scrapers'
    // tables list them; without these rows "500m" text matched nothing
    // and "SR4 150m" was reported as 300m.
    [/\b500\s*m\b/i, "500m", 500],
    [/\b300\s*m\b/i, "300m", 300],
    [/\b150\s*m\b/i, "150m", 150],
    [/\b100\s*m\b/i, "100m", 100],
    // Optic-class abbreviations, used only when no explicit distance is present.
    [/\bLR4\b/, "10km", 10000],
    [/\bLR\b/, "10km", 10000],
    [/\bER4?\b/, "40km", 40000],
    [/\bZR4?\b/, "80km", 80000],
    [/\bSR4?\b/, "300m", 300],
    [/\bDR4?\b/, "500m", 500],
    [/\bFR4?\b/, "2km", 2000],
  ];

  const hit = patterns.find(([re]) => re.test(text));
  return hit ? { label: hit[1], meters: hit[2] } : undefined;
}
|
||||
|
||||
/**
 * Classify the transmission medium of a product from free text.
 *
 * Checks run in order SMF, MMF, Copper; the first match wins. All matching
 * is case-insensitive. Two-letter optic codes (LX/LR/ER/ZR, SX/SR) must not
 * touch another letter on either side, but may sit at the very start or end
 * of the text — e.g. "SFP-10G-LR" or "10GBASE-SR".
 *
 * @param text Product name or description.
 * @returns "SMF", "MMF", "Copper", or "" when nothing matches.
 */
function detectFiber(text: string): string {
  // (?:^|[^a-z]) / (?:[^a-z]|$) accept a string edge as a delimiter. A bare
  // [^a-z] on both sides would require a real character there and so miss
  // codes that begin or end the text.
  const singleMode = /single.?mode|smf|(?:^|[^a-z])(?:lx|lr|er|zr)(?:[^a-z]|$)|bidi|cwdm|dwdm/i;
  const multiMode = /multi.?mode|mmf|(?:^|[^a-z])(?:sx|sr)(?:[^a-z]|$)/i;
  const copper = /copper|dac|twinax|rj.?45|base-t/i;

  if (singleMode.test(text)) return "SMF";
  if (multiMode.test(text)) return "MMF";
  if (copper.test(text)) return "Copper";
  return "";
}
|
||||
|
||||
/**
 * Extract the first wavelength, in nanometres, mentioned in free text.
 *
 * Matches a 3- or 4-digit number followed by optional whitespace and "nm"
 * (case-insensitive). The number must start at the beginning of the text or
 * after a non-digit, so the tail of a longer digit run is not mistaken for
 * a wavelength.
 *
 * @param text Product name or description.
 * @returns The digits as a string (e.g. "1310"), or "" when none is found.
 */
function detectWavelength(text: string): string {
  const hit = /(?:^|\D)(\d{3,4})\s*nm/i.exec(text);
  return hit ? hit[1] : "";
}
|
||||
|
||||
/** Turn the HTML entity `&amp;` back into a literal ampersand. */
function decodeAmpersands(text: string): string {
  return text.replace(/&amp;/g, "&");
}

/**
 * Read the first `$`-prefixed amount in an HTML fragment.
 * Returns undefined when there is no match or the value is not within
 * the open range (0, 100000).
 */
function firstDollarPrice(fragment: string): number | undefined {
  const hit = fragment.match(/\$\s*([\d,]+\.?\d*)/);
  if (!hit) return undefined;
  const value = parseFloat(hit[1].replace(/,/g, ""));
  return value > 0 && value < 100000 ? value : undefined;
}

/** Build a Product from category-level fields plus attributes detected in the name. */
function toProduct(
  cat: typeof CATEGORIES[number],
  name: string,
  url: string,
  partNumber: string,
  price: number | undefined,
): Product {
  const reach = detectReach(name);
  return {
    partNumber,
    name,
    url,
    price,
    formFactor: cat.formFactor,
    speed: cat.speed,
    speedGbps: cat.speedGbps,
    reachLabel: reach?.label,
    reachMeters: reach?.meters,
    fiberType: detectFiber(name),
    wavelength: detectWavelength(name),
  };
}

/**
 * Extract products from one category listing page.
 *
 * Strategy 1 scans `<li class="...product...">` cards. Strategy 2 runs only
 * when strategy 1 found nothing: it scans anchors that point at
 * `/product/` or `/shop/` URLs and looks for a price in the text around
 * each anchor.
 *
 * @param html Raw page HTML.
 * @param cat  Category whose form factor and speed are stamped on every product.
 * @returns Products in page order, at most one per URL.
 */
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
  const products: Product[] = [];
  const seen = new Set<string>();
  // Collapse whitespace so the regexes do not depend on line breaks.
  const collapsed = html.replace(/\s+/g, " ");

  // Strategy 1: WooCommerce standard product loop
  for (const m of collapsed.matchAll(/<li[^>]+class="[^"]*product[^"]*"[^>]*>([\s\S]*?)<\/li>/gi)) {
    const card = m[1];

    const urlMatch = card.match(/href="(https?:\/\/(?:www\.)?opticsbay\.com\/[^"?#]+)"/i);
    if (!urlMatch) continue;
    const url = urlMatch[1];
    if (seen.has(url)) continue;
    seen.add(url);

    const nameMatch =
      card.match(/woocommerce-loop-product__title[^>]*>([^<]+)</i) ||
      card.match(/<h[23][^>]*>([^<]{8,})<\/h[23]>/i);
    if (!nameMatch) continue;

    // Decode &amp; first, then drop any remaining numeric entities.
    const name = decodeAmpersands(nameMatch[1].trim()).replace(/&#\d+;/g, "");
    if (name.length < 5) continue;

    // Part number: the text before "compatible" / "for <word>", capped at 80 chars.
    const partNumber =
      name.split(/\s+(?:compatible|for\s+[A-Z])/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);

    products.push(toProduct(cat, name, url, partNumber, firstDollarPrice(card)));
  }

  // Strategy 2: Generic product link scan
  if (products.length === 0) {
    const linkRe = /href="(https?:\/\/(?:www\.)?opticsbay\.com\/(?:product|shop)\/[^"?#]+)"[^>]*>([^<]{8,})</gi;
    for (const m of collapsed.matchAll(linkRe)) {
      const url = m[1];
      const name = decodeAmpersands(m[2].trim());
      if (seen.has(url) || name.length < 8) continue;

      // Test the keywords against the name and the URL *path* only. The
      // host "opticsbay.com" itself contains "optic", so testing the full
      // URL would let every link through.
      const urlPath = url.replace(/^https?:\/\/[^/]+/i, "");
      if (!/transceiver|sfp|qsfp|dac|optic/i.test(name + urlPath)) continue;
      seen.add(url);

      // Look for a price around THIS anchor. Searching for the URL string
      // could land on an earlier link, or on a longer URL that shares the
      // same prefix.
      const at = m.index ?? 0;
      const ctx = collapsed.slice(Math.max(0, at - 300), at + 600);

      const partNumber = name.split(/\s+/)[0]?.slice(0, 80) || "";
      products.push(toProduct(cat, name, url, partNumber, firstDollarPrice(ctx)));
    }
  }

  return products;
}
|
||||
|
||||
/**
 * GET a page with the scraper's headers and a 30-second timeout.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body as text.
 * @throws Error with `HTTP <status> for <url>` when the response is not OK.
 */
async function fetchPage(url: string): Promise<string> {
  const response = await fetch(url, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
|
||||
|
||||
/** Keep the first product seen for each URL, preserving order. */
function dedupeByUrl(items: Product[]): Product[] {
  const byUrl = new Map<string, Product>();
  for (const item of items) {
    if (!byUrl.has(item.url)) byUrl.set(item.url, item);
  }
  return Array.from(byUrl.values());
}

/**
 * Walk listing pages 2..MAX_PAGES of a category and append their products
 * to those already found on page 1. Stops at the first empty or failed page.
 * Waits 2 s before every request.
 */
async function collectRemainingPages(
  cat: typeof CATEGORIES[number],
  firstPage: Product[],
): Promise<Product[]> {
  const all = [...firstPage];
  for (let page = 2; page <= MAX_PAGES; page++) {
    await sleep(2000);
    try {
      const html = await fetchPage(`${BASE}${cat.path}page/${page}/`);
      const found = parseProductList(html, cat);
      if (found.length === 0) break;
      all.push(...found);
      console.log(` Page ${page}: ${found.length} products`);
    } catch (err) {
      console.warn(` Page ${page} failed: ${(err as Error).message.slice(0, 60)}`);
      break;
    }
  }
  return all;
}

/**
 * Crawl every OpticsBay category, upsert each product as a scraped
 * transceiver, and record a USD price observation when a price was parsed.
 *
 * A failure in one category or one product is logged and skipped; the crawl
 * continues with the next one.
 */
export async function scrapeOpticsBay(): Promise<void> {
  console.log("=== OpticsBay Scraper Starting ===\n");

  const vendorId = await ensureVendor(
    "OpticsBay",
    "compatible",
    "https://www.opticsbay.com",
    "https://www.opticsbay.com/product-category/transceivers/",
  );

  let totalProducts = 0;
  let priceUpdates = 0;
  const seenCategories = new Set<string>();

  for (const cat of CATEGORIES) {
    console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`);

    // Decide whether to skip the generic listing BEFORE fetching it, so a
    // page that will be discarded is never requested.
    if (cat.path.includes("/transceivers/") && seenCategories.size > 3) {
      console.log(` Skipping generic fallback (${seenCategories.size} categories scraped)`);
      continue;
    }

    try {
      const html1 = await fetchPage(BASE + cat.path);
      const catProducts = parseProductList(html1, cat);

      if (catProducts.length === 0) {
        console.log(" No products on page 1 — skipping");
        continue; // the finally block below still paces the next request
      }

      seenCategories.add(cat.path);
      console.log(` Found ${catProducts.length} products on page 1`);

      const uniqueProducts = dedupeByUrl(await collectRemainingPages(cat, catProducts));
      console.log(` Total unique: ${uniqueProducts.length}`);

      for (const product of uniqueProducts) {
        try {
          const txId = await findOrCreateScrapedTransceiver({
            partNumber: product.partNumber,
            vendorId,
            formFactor: product.formFactor,
            speedGbps: product.speedGbps,
            speed: product.speed,
            reachMeters: product.reachMeters,
            reachLabel: product.reachLabel,
            fiberType: product.fiberType,
            wavelengths: product.wavelength,
            category: "DataCenter",
          });

          if (product.price && product.price > 0) {
            const hash = contentHash({ price: product.price, part: product.partNumber });
            const updated = await upsertPriceObservation({
              transceiverId: txId,
              sourceVendorId: vendorId,
              price: product.price,
              currency: "USD",
              stockLevel: "in_stock",
              url: product.url,
              contentHash: hash,
            });
            if (updated) priceUpdates++;
          }
          totalProducts++;
        } catch (err) {
          console.warn(` DB error: ${(err as Error).message.slice(0, 80)}`);
        }
      }
    } catch (err) {
      console.error(` Category failed: ${(err as Error).message}`);
    } finally {
      // Runs on success, failure and `continue` alike, so every category
      // request is followed by the 2 s pause.
      await sleep(2000);
    }
  }

  console.log(`\n=== OpticsBay Complete: ${totalProducts} products, ${priceUpdates} price updates ===`);
}
|
||||
|
||||
// Run the scraper when this file is executed directly rather than imported.
if (require.main === module) {
  (async () => {
    try {
      await scrapeOpticsBay();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}
|
||||
@ -1,57 +1,35 @@
|
||||
/**
|
||||
* QSFPTEK Scraper — Chinese compatible transceiver vendor
|
||||
*
|
||||
* qsfptek.com — migrated to custom Java/Spring + Vue.js in 2025.
|
||||
* Old OpenCart /c/*.html paths are gone (404).
|
||||
* Products are served via HTML-fragment API: /mall/commodity/list
|
||||
*
|
||||
* API: GET /mall/commodity/list?categoryId=21&attributes=VALUE_ID&page=N&pageSize=30
|
||||
* Returns HTML fragment with product cards.
|
||||
*
|
||||
* Phase 1: Collect product list + prices via API (plain HTTP, no JS needed)
|
||||
* Phase 2: Fetch product detail pages to extract real-time stock count
|
||||
* Format: "5507 in real-time stock, 17 Apr, 2026"
|
||||
* Confidence: 2 (aggregated global count with vendor timestamp)
|
||||
*
|
||||
* qsfptek.com — Server-rendered HTML shop, USD pricing.
|
||||
* Focuses on QSFP+/QSFP28/QSFP-DD/SFP+ form factors.
|
||||
* Rate limited: 1 req/2sec.
|
||||
*
|
||||
* QSFPTEK (Shenzhen Optotech Technology) — competitive pricing,
|
||||
* transparent USD prices, no account required.
|
||||
*/
|
||||
import * as cheerio from "cheerio";
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, upsertStockObservation } from "../utils/db";
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
|
||||
const BASE = "https://www.qsfptek.com";
|
||||
const LIST_API = "/mall/commodity/list";
|
||||
const CATEGORY_ID = "21"; // Transceivers (top-level, all products)
|
||||
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml",
|
||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
};
|
||||
|
||||
const MAX_PAGES = 40;
|
||||
const PAGE_SIZE = 30;
|
||||
// Limit detail-page fetches per run to avoid overwhelming the server
|
||||
// (~500 products × 2s = ~17min for a full refresh; subsequent runs are faster
|
||||
// since unchanged stock counts are skipped by upsertStockObservation)
|
||||
const MAX_DETAIL_PAGES = 500;
|
||||
const MAX_PAGES = 30;
|
||||
|
||||
// Data rate attribute values (found in /mall/commodity/attribute?categoryId=21)
|
||||
// pid = "2c9180837bbaf08f017bbdd1ebf7001e" (Data Rate attribute group)
|
||||
const DATA_RATE_ATTRIBUTES: Array<{
|
||||
attrId: string;
|
||||
speed: string;
|
||||
speedGbps: number;
|
||||
formFactor: string;
|
||||
}> = [
|
||||
{ attrId: "2c9180837bbaf08f017bbde50ea30101", speed: "1G", speedGbps: 1, formFactor: "SFP" },
|
||||
{ attrId: "2c9180837bbaf08f017bbde50ea30100", speed: "10G", speedGbps: 10, formFactor: "SFP+" },
|
||||
{ attrId: "2c9180837bbaf08f017bbde50ea300ff", speed: "25G", speedGbps: 25, formFactor: "SFP28" },
|
||||
{ attrId: "2c9180837bbaf08f017bbde50ea300fe", speed: "40G", speedGbps: 40, formFactor: "QSFP+" },
|
||||
{ attrId: "2c9180837bbaf08f017bbde50ea300fd", speed: "100G", speedGbps: 100, formFactor: "QSFP28" },
|
||||
{ attrId: "2c98491f8f4b8e55018f94aa8c5d48ff", speed: "200G", speedGbps: 200, formFactor: "QSFP56" },
|
||||
{ attrId: "2c9180837e2e7f64017e389caf0700c8", speed: "400G", speedGbps: 400, formFactor: "QSFP-DD" },
|
||||
{ attrId: "2c98491f8e363cbf018e3faa5b6d2f8b", speed: "800G", speedGbps: 800, formFactor: "OSFP" },
|
||||
const CATEGORIES = [
|
||||
{ path: "/c/sfp-transceiver.html", formFactor: "SFP", speed: "1G", speedGbps: 1 },
|
||||
{ path: "/c/sfp-plus-transceiver.html", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
|
||||
{ path: "/c/sfp28-transceiver.html", formFactor: "SFP28", speed: "25G", speedGbps: 25 },
|
||||
{ path: "/c/qsfp-plus-transceiver.html", formFactor: "QSFP+", speed: "40G", speedGbps: 40 },
|
||||
{ path: "/c/qsfp28-transceiver.html", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
|
||||
{ path: "/c/qsfp56-transceiver.html", formFactor: "QSFP56", speed: "200G", speedGbps: 200 },
|
||||
{ path: "/c/qsfp-dd-transceiver.html", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
|
||||
{ path: "/c/osfp-transceiver.html", formFactor: "OSFP", speed: "800G", speedGbps: 800 },
|
||||
{ path: "/c/optical-transceiver.html", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
|
||||
];
|
||||
|
||||
interface Product {
|
||||
@ -66,11 +44,7 @@ interface Product {
|
||||
reachMeters?: number;
|
||||
fiberType?: string;
|
||||
wavelength?: string;
|
||||
}
|
||||
|
||||
interface StockDetail {
|
||||
qty: number;
|
||||
vendorTs: Date | null;
|
||||
compatibleWith?: string;
|
||||
}
|
||||
|
||||
function sleep(ms: number): Promise<void> {
|
||||
@ -86,15 +60,19 @@ function detectReach(text: string): { label: string; meters: number } | undefine
|
||||
[/\b10\s*km\b/i, "10km", 10000],
|
||||
[/\b2\s*km\b/i, "2km", 2000],
|
||||
[/\b550\s*m\b/i, "550m", 550],
|
||||
[/\b500\s*m\b/i, "500m", 500],
|
||||
[/\b300\s*m\b/i, "300m", 300],
|
||||
[/\b100\s*m\b/i, "100m", 100],
|
||||
[/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000],
|
||||
[/\bER4?\b/, "40km", 40000], [/\bZR4?\b/, "80km", 80000],
|
||||
[/\bSR4?\b/, "300m", 300], [/\bDR4?\b/, "500m", 500],
|
||||
[/\bLR4\b/, "10km", 10000],
|
||||
[/\bLR\b/, "10km", 10000],
|
||||
[/\bER4?\b/, "40km", 40000],
|
||||
[/\bZR4?\b/, "80km", 80000],
|
||||
[/\bSR4?\b/, "300m", 300],
|
||||
[/\bDR4?\b/, "500m", 500],
|
||||
[/\bFR4?\b/, "2km", 2000],
|
||||
];
|
||||
for (const [re, label, meters] of patterns) {
|
||||
if (re.test(text)) return { label, meters };
|
||||
for (const [regex, label, meters] of patterns) {
|
||||
if (regex.test(text)) return { label, meters };
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
@ -107,163 +85,153 @@ function detectFiber(text: string): string {
|
||||
}
|
||||
|
||||
function detectWavelength(text: string): string {
|
||||
const m = text.match(/(\d{3,4})\s*nm/i);
|
||||
return m ? m[1] : "";
|
||||
const match = text.match(/(\d{3,4})\s*nm/i);
|
||||
return match ? match[1] : "";
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse QSFPTEK real-time stock text.
|
||||
* Format: "5507 in real-time stock, 17 Apr, 2026"
|
||||
* Returns { qty, vendorTs } or null if not found.
|
||||
*/
|
||||
function parseStockDetail(html: string): StockDetail | null {
|
||||
// Match: "<number> in real-time stock, <date>"
|
||||
const m = html.match(/(\d[\d,]*)\s+in\s+real-?time\s+stock[,\s]+(\d{1,2}\s+\w+,?\s*\d{4})/i);
|
||||
if (!m) {
|
||||
// Also try: "<number> in stock" without timestamp
|
||||
const simple = html.match(/(\d[\d,]+)\s+in\s+(?:real-?time\s+)?stock\b/i);
|
||||
if (simple) {
|
||||
const qty = parseInt(simple[1].replace(/,/g, ""), 10);
|
||||
return isNaN(qty) || qty < 0 ? null : { qty, vendorTs: null };
|
||||
function extractCompatibleVendor(name: string): string {
|
||||
const brands = ["Cisco", "Juniper", "Arista", "HPE", "Aruba", "Dell", "Brocade", "Extreme",
|
||||
"Huawei", "Nokia", "MikroTik", "Mellanox", "Nvidia", "Ubiquiti", "Allied Telesis"];
|
||||
for (const brand of brands) {
|
||||
if (new RegExp(`\\b${brand}\\b`, "i").test(name)) return brand;
|
||||
}
|
||||
return null;
|
||||
return "";
|
||||
}
|
||||
|
||||
const qty = parseInt(m[1].replace(/,/g, ""), 10);
|
||||
if (isNaN(qty) || qty < 0) return null;
|
||||
|
||||
// Parse vendor timestamp: "17 Apr, 2026" → Date
|
||||
let vendorTs: Date | null = null;
|
||||
try {
|
||||
const dateStr = m[2].replace(",", "");
|
||||
const d = new Date(dateStr);
|
||||
if (!isNaN(d.getTime())) vendorTs = d;
|
||||
} catch {
|
||||
// ignore unparseable date
|
||||
}
|
||||
|
||||
return { qty, vendorTs };
|
||||
}
|
||||
|
||||
function parseProductFragment(html: string, attr: typeof DATA_RATE_ATTRIBUTES[number]): Product[] {
|
||||
const $ = cheerio.load(html);
|
||||
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
|
||||
const products: Product[] = [];
|
||||
const seen = new Set<string>();
|
||||
const collapsed = html.replace(/\s+/g, " ");
|
||||
|
||||
// Product cards: li.Hot, li.New, li with class containing product
|
||||
$("li, div.kuang").each((_i, el) => {
|
||||
const $el = $(el);
|
||||
// Strategy 1: OpenCart / custom card layout using matchAll
|
||||
for (const cardMatch of collapsed.matchAll(/<div[^>]+class="[^"]*product-(?:thumb|layout)[^"]*"[^>]*>([\s\S]*?)<\/div>\s*<\/div>/gi)) {
|
||||
const card = cardMatch[1];
|
||||
|
||||
// Get product URL
|
||||
const href = $el.find("a[href*='/product/']").first().attr("href") ||
|
||||
$el.find("a[href*='/en/product/']").first().attr("href");
|
||||
if (!href) return;
|
||||
const url = href.startsWith("http") ? href : BASE + href;
|
||||
if (seen.has(url)) return;
|
||||
const urlMatch = card.match(/href="(https?:\/\/(?:www\.)?qsfptek\.com\/[^"]+)"/i);
|
||||
if (!urlMatch) continue;
|
||||
const url = urlMatch[1];
|
||||
if (seen.has(url)) continue;
|
||||
seen.add(url);
|
||||
|
||||
// Get product name from h3
|
||||
const name = $el.find("h3.tit, h3").first().text().trim().replace(/\s+/g, " ");
|
||||
if (!name || name.length < 10) return;
|
||||
const nameMatch = card.match(/<h[34][^>]*>\s*<a[^>]*>([^<]{10,})<\/a>/i) ||
|
||||
card.match(/<a[^>]*title="([^"]{10,})"/i);
|
||||
if (!nameMatch) continue;
|
||||
const name = nameMatch[1].trim().replace(/&/g, "&").replace(/&#[0-9]+;/g, "");
|
||||
if (name.length < 5) continue;
|
||||
|
||||
// Get price: "US$ 33.90" format
|
||||
const priceText = $el.find("*").text();
|
||||
const priceMatch = priceText.match(/US\$\s*([\d,]+\.?\d{0,2})/);
|
||||
const priceMatch = card.match(/\$\s*([\d,]+\.?\d*)/);
|
||||
const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined;
|
||||
|
||||
const reach = detectReach(name);
|
||||
const partNumber = name.split(/\s+(?:compatible|for\s+[A-Z]|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);
|
||||
const partNumber = name.split(/\s+(?:compatible|for|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);
|
||||
|
||||
products.push({
|
||||
partNumber, name, url,
|
||||
price: price && price > 0 && price < 100000 ? price : undefined,
|
||||
formFactor: attr.formFactor, speed: attr.speed, speedGbps: attr.speedGbps,
|
||||
formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps,
|
||||
reachLabel: reach?.label, reachMeters: reach?.meters,
|
||||
fiberType: detectFiber(name), wavelength: detectWavelength(name),
|
||||
compatibleWith: extractCompatibleVendor(name),
|
||||
});
|
||||
}
|
||||
|
||||
// Strategy 2: Generic product link scan using matchAll
|
||||
if (products.length === 0) {
|
||||
for (const m of collapsed.matchAll(/href="(https?:\/\/(?:www\.)?qsfptek\.com\/(?:p|product)[^"?#]+)"[^>]*>([^<]{10,})</gi)) {
|
||||
const url = m[1];
|
||||
const name = m[2].trim().replace(/&/g, "&");
|
||||
if (seen.has(url) || name.length < 10) continue;
|
||||
if (!/transceiver|sfp|qsfp|osfp|dac|aoc/i.test(name)) continue;
|
||||
seen.add(url);
|
||||
|
||||
const idx = collapsed.indexOf(url);
|
||||
const ctx = collapsed.slice(Math.max(0, idx - 300), idx + 600);
|
||||
const priceM = ctx.match(/\$\s*([\d,]+\.?\d*)/);
|
||||
const price = priceM ? parseFloat(priceM[1].replace(/,/g, "")) : undefined;
|
||||
const reach = detectReach(name);
|
||||
|
||||
products.push({
|
||||
partNumber: name.split(/\s+/)[0]?.slice(0, 80) || "",
|
||||
name, url,
|
||||
price: price && price > 0 && price < 100000 ? price : undefined,
|
||||
formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps,
|
||||
reachLabel: reach?.label, reachMeters: reach?.meters,
|
||||
fiberType: detectFiber(name), wavelength: detectWavelength(name),
|
||||
compatibleWith: extractCompatibleVendor(name),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return products;
|
||||
}
|
||||
|
||||
async function fetchProductList(attrId: string, page: number): Promise<string> {
|
||||
const url = `${BASE}${LIST_API}?categoryId=${CATEGORY_ID}&attributes=${attrId}&page=${page}&pageSize=${PAGE_SIZE}`;
|
||||
async function fetchPage(url: string): Promise<string> {
|
||||
const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(30000) });
|
||||
if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);
|
||||
return resp.text();
|
||||
}
|
||||
|
||||
async function fetchDetailPage(url: string): Promise<string> {
|
||||
// Normalise URL: ensure /en/ prefix for detail pages
|
||||
const normalized = url.includes("/en/product/") ? url : url.replace("/product/", "/en/product/");
|
||||
const resp = await fetch(normalized, { headers: HEADERS, signal: AbortSignal.timeout(30000) });
|
||||
if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${normalized}`);
|
||||
return resp.text();
|
||||
}
|
||||
|
||||
export async function scrapeQsfptek(): Promise<void> {
|
||||
console.log("=== QSFPTEK Scraper v2 Starting (API + stock detail mode) ===\n");
|
||||
console.log("=== QSFPTEK Scraper Starting ===\n");
|
||||
|
||||
const vendorId = await ensureVendor(
|
||||
"QSFPTEK",
|
||||
"compatible",
|
||||
"https://www.qsfptek.com",
|
||||
"https://www.qsfptek.com/c/fiber-optic-transceiver.html",
|
||||
"https://www.qsfptek.com/c/optical-transceiver.html",
|
||||
);
|
||||
|
||||
// ── Phase 1: Collect all products via API listing ─────────────────────────
|
||||
console.log("[Phase 1] Collecting product catalog...");
|
||||
const allProducts = new Map<string, Product>(); // url → product
|
||||
|
||||
for (const attr of DATA_RATE_ATTRIBUTES) {
|
||||
console.log(`\n ${attr.formFactor} (${attr.speed})`);
|
||||
|
||||
for (let page = 1; page <= MAX_PAGES; page++) {
|
||||
try {
|
||||
const html = await fetchProductList(attr.attrId, page);
|
||||
const pageProds = parseProductFragment(html, attr);
|
||||
|
||||
if (pageProds.length === 0) {
|
||||
if (page === 1) console.log(" No products on page 1 — skipping");
|
||||
else console.log(` Page ${page}: empty, stopping`);
|
||||
break;
|
||||
}
|
||||
|
||||
let newCount = 0;
|
||||
for (const p of pageProds) {
|
||||
if (!allProducts.has(p.url)) {
|
||||
allProducts.set(p.url, p);
|
||||
newCount++;
|
||||
}
|
||||
}
|
||||
console.log(` Page ${page}: ${pageProds.length} results, ${newCount} new (${allProducts.size} total)`);
|
||||
|
||||
if (pageProds.length < PAGE_SIZE) break;
|
||||
if (page < MAX_PAGES) await sleep(2000);
|
||||
} catch (err) {
|
||||
console.warn(` Page ${page} failed: ${(err as Error).message.slice(0, 80)}`);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
await sleep(2000);
|
||||
}
|
||||
|
||||
console.log(`\n[Phase 1] Complete — ${allProducts.size} unique products collected`);
|
||||
|
||||
// ── Phase 2: Write to DB + fetch detail pages for stock counts ─────────────
|
||||
console.log("\n[Phase 2] Writing prices + fetching real-time stock counts...");
|
||||
|
||||
let totalProducts = 0;
|
||||
let priceUpdates = 0;
|
||||
let stockWritten = 0;
|
||||
let stockSkipped = 0;
|
||||
let detailFetched = 0;
|
||||
let errors = 0;
|
||||
const seenCategories = new Set<string>();
|
||||
|
||||
const products = [...allProducts.values()].slice(0, MAX_DETAIL_PAGES);
|
||||
for (const cat of CATEGORIES) {
|
||||
console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`);
|
||||
|
||||
for (const product of products) {
|
||||
try {
|
||||
const html1 = await fetchPage(BASE + cat.path);
|
||||
const catProducts = parseProductList(html1, cat);
|
||||
|
||||
if (cat.path.includes("/optical-transceiver") && seenCategories.size > 3) {
|
||||
console.log(` Skipping generic fallback (${seenCategories.size} specific categories scraped)`);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (catProducts.length === 0) {
|
||||
console.log(" No products on page 1 — skipping");
|
||||
continue;
|
||||
}
|
||||
|
||||
seenCategories.add(cat.path);
|
||||
console.log(` Found ${catProducts.length} products on page 1`);
|
||||
|
||||
const totalPagesMatch =
|
||||
html1.match(/total-page[^>]*>\s*(\d+)/) ||
|
||||
html1.match(/page\s+\d+\s+of\s+(\d+)/i);
|
||||
const totalPages = totalPagesMatch ? Math.min(parseInt(totalPagesMatch[1]), MAX_PAGES) : 3;
|
||||
console.log(` Total pages (estimate): ${totalPages}`);
|
||||
|
||||
const allProducts = [...catProducts];
|
||||
|
||||
for (let page = 2; page <= totalPages; page++) {
|
||||
await sleep(2000);
|
||||
try {
|
||||
const pageUrl = BASE + cat.path.replace(".html", "") + `?page=${page}`;
|
||||
const html = await fetchPage(pageUrl);
|
||||
const pageProds = parseProductList(html, cat);
|
||||
if (pageProds.length === 0) break;
|
||||
allProducts.push(...pageProds);
|
||||
console.log(` Page ${page}: ${pageProds.length} products`);
|
||||
} catch (err) {
|
||||
console.warn(` Page ${page} failed: ${(err as Error).message.slice(0, 60)}`);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const uniqueProducts = allProducts.filter((p, i, arr) => arr.findIndex((x) => x.url === p.url) === i);
|
||||
console.log(` Total unique: ${uniqueProducts.length}`);
|
||||
|
||||
for (const product of uniqueProducts) {
|
||||
try {
|
||||
const txId = await findOrCreateScrapedTransceiver({
|
||||
partNumber: product.partNumber,
|
||||
@ -278,7 +246,6 @@ export async function scrapeQsfptek(): Promise<void> {
|
||||
category: "DataCenter",
|
||||
});
|
||||
|
||||
// Price observation from listing page
|
||||
if (product.price && product.price > 0) {
|
||||
const hash = contentHash({ price: product.price, part: product.partNumber });
|
||||
const updated = await upsertPriceObservation({
|
||||
@ -292,51 +259,19 @@ export async function scrapeQsfptek(): Promise<void> {
|
||||
});
|
||||
if (updated) priceUpdates++;
|
||||
}
|
||||
|
||||
// Fetch detail page for real-time stock count
|
||||
await sleep(2000);
|
||||
try {
|
||||
const detailHtml = await fetchDetailPage(product.url);
|
||||
detailFetched++;
|
||||
const stockInfo = parseStockDetail(detailHtml);
|
||||
|
||||
if (stockInfo !== null) {
|
||||
const isNew = await upsertStockObservation({
|
||||
transceiverId: txId,
|
||||
sourceVendorId: vendorId,
|
||||
stockLevel: stockInfo.qty > 0 ? "in_stock" : "out_of_stock",
|
||||
quantityAvailable: stockInfo.qty > 0 ? stockInfo.qty : undefined,
|
||||
productUrl: product.url,
|
||||
// Quality metadata: QSFPTEK provides aggregated global count + timestamp
|
||||
stockConfidence: 2,
|
||||
priceCurrency: "USD",
|
||||
priceIncludesTax: false,
|
||||
stockVendorTs: stockInfo.vendorTs,
|
||||
});
|
||||
if (isNew) stockWritten++;
|
||||
else stockSkipped++;
|
||||
}
|
||||
} catch (detailErr) {
|
||||
// Detail page failures are non-fatal — we still have price data
|
||||
console.warn(` Stock fetch failed for ${product.partNumber}: ${(detailErr as Error).message.slice(0, 60)}`);
|
||||
}
|
||||
|
||||
totalProducts++;
|
||||
if (totalProducts % 50 === 0) {
|
||||
console.log(` Progress: ${totalProducts}/${products.length} products | ${priceUpdates} prices | ${stockWritten} stock obs`);
|
||||
} catch (err) {
|
||||
console.warn(` DB error: ${(err as Error).message.slice(0, 80)}`);
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
console.warn(` DB error for ${product.partNumber}: ${(err as Error).message.slice(0, 80)}`);
|
||||
errors++;
|
||||
}
|
||||
console.error(` Category failed: ${(err as Error).message}`);
|
||||
}
|
||||
|
||||
console.log("\n=== QSFPTEK Scraper v2 Complete ===");
|
||||
console.log(` Products processed: ${totalProducts}`);
|
||||
console.log(` Price observations: ${priceUpdates} new`);
|
||||
console.log(` Detail pages fetched: ${detailFetched}`);
|
||||
console.log(` Stock observations: ${stockWritten} new, ${stockSkipped} unchanged`);
|
||||
if (errors > 0) console.warn(` Errors: ${errors}`);
|
||||
await sleep(2000);
|
||||
}
|
||||
|
||||
console.log(`\n=== QSFPTEK Complete: ${totalProducts} products, ${priceUpdates} price updates ===`);
|
||||
}
|
||||
|
||||
if (require.main === module) {
|
||||
|
||||
@ -1,44 +1,26 @@
|
||||
/**
|
||||
* SmartOptics Scraper — Premium coherent/DWDM transceiver manufacturer
|
||||
*
|
||||
* smartoptics.com — WordPress/WooCommerce, no prices (B2B, RFQ only).
|
||||
* Scrapes product catalog for specs, images, and datasheets.
|
||||
*
|
||||
* v2 fixes:
|
||||
* - Multi-category crawl (coherent, DWDM, access, SFP, QSFP)
|
||||
* - Handles both absolute AND relative product URLs
|
||||
* - WooCommerce REST API fallback for complete product list
|
||||
* - Up to 10 pagination pages per category
|
||||
* smartoptics.com — WordPress site, no prices (B2B, RFQ model).
|
||||
* Scrapes product catalog for specs, images, datasheets.
|
||||
* Products listed at /products/optical-transceivers/ → individual /product/SKU/ pages.
|
||||
*/
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor } from "../utils/db";
|
||||
|
||||
const BASE = "https://smartoptics.com";
|
||||
const CATALOG_URL = `${BASE}/products/optical-transceivers/`;
|
||||
const HEADERS = {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml",
|
||||
};
|
||||
|
||||
/** All transceiver-related catalog category pages to crawl */
|
||||
const CATALOG_PAGES = [
|
||||
"/products/optical-transceivers/",
|
||||
"/products/",
|
||||
"/product-category/optical-transceivers/",
|
||||
"/product-category/transceivers/",
|
||||
"/product-category/sfp/",
|
||||
"/product-category/qsfp/",
|
||||
"/product-category/coherent/",
|
||||
"/product-category/dwdm/",
|
||||
];
|
||||
|
||||
function sleep(ms: number): Promise<void> {
|
||||
return new Promise((r) => setTimeout(r, ms));
|
||||
}
|
||||
|
||||
function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } {
|
||||
const t = text.toLowerCase();
|
||||
if (t.includes("qsfp-dd800") || t.includes("800ge")) return { formFactor: "QSFP-DD", speed: "800G", speedGbps: 800 };
|
||||
if (t.includes("qsfp-dd800") || t.includes("sfp-dd800") || t.includes("800ge")) return { formFactor: "QSFP-DD", speed: "800G", speedGbps: 800 };
|
||||
if (t.includes("qsfp-dd") || (t.includes("400g") && t.includes("qsfp"))) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };
|
||||
if (t.includes("qsfp112")) return { formFactor: "QSFP112", speed: "400G", speedGbps: 400 };
|
||||
if (t.includes("qsfp56")) return { formFactor: "QSFP56", speed: "200G", speedGbps: 200 };
|
||||
@ -51,15 +33,22 @@ function detectFormFactor(text: string): { formFactor: string; speed: string; sp
|
||||
}
|
||||
|
||||
function detectReach(text: string): { label: string; meters: number } | undefined {
|
||||
const kmMatch = text.match(/(\d+(?:\.\d+)?)\s*km/i);
|
||||
if (kmMatch) { const km = parseFloat(kmMatch[1]); return { label: `${km}km`, meters: km * 1000 }; }
|
||||
const kmMatch = text.match(/(\d+)\s*km/i);
|
||||
if (kmMatch) {
|
||||
const km = parseInt(kmMatch[1]);
|
||||
return { label: `${km}km`, meters: km * 1000 };
|
||||
}
|
||||
const mMatch = text.match(/(\d+)\s*m\b/i);
|
||||
if (mMatch) { const m = parseInt(mMatch[1]); return { label: `${m}m`, meters: m }; }
|
||||
if (mMatch) {
|
||||
const m = parseInt(mMatch[1]);
|
||||
return { label: `${m}m`, meters: m };
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function detectFiber(text: string): string {
|
||||
if (/multi.?mode|mmf|sr\b/i.test(text)) return "MMF";
|
||||
if (/dwdm|cwdm|coherent|coh|single.?mode|smf/i.test(text)) return "SMF";
|
||||
if (/multi.?mode|mmf|sr/i.test(text)) return "MMF";
|
||||
return "SMF"; // SmartOptics is almost exclusively SMF/coherent
|
||||
}
|
||||
|
||||
@ -69,52 +58,15 @@ async function fetchPage(url: string): Promise<string> {
|
||||
return resp.text();
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract all /product/xxx/ URLs from an HTML page.
|
||||
* Handles both absolute (https://smartoptics.com/product/...) and
|
||||
* root-relative (/product/...) href patterns.
|
||||
*/
|
||||
function extractProductUrls(html: string, pageUrl: string): string[] {
|
||||
function extractProductUrls(html: string): string[] {
|
||||
const urls = new Set<string>();
|
||||
|
||||
// Absolute URLs
|
||||
const absRegex = /href="(https?:\/\/(?:www\.)?smartoptics\.com\/product\/[^"#?]+)"/gi;
|
||||
const regex = /href="(https?:\/\/smartoptics\.com\/product\/[^"]+)"/gi;
|
||||
let m: RegExpExecArray | null;
|
||||
while ((m = absRegex.exec(html)) !== null) {
|
||||
urls.add(normalizeProductUrl(m[1]));
|
||||
}
|
||||
|
||||
// Root-relative: href="/product/..." or href="/products/..." (individual product, not category)
|
||||
const relRegex = /href="(\/product\/[^"#?]+)"/gi;
|
||||
while ((m = relRegex.exec(html)) !== null) {
|
||||
urls.add(normalizeProductUrl(`${BASE}${m[1]}`));
|
||||
}
|
||||
|
||||
// WooCommerce data attributes: data-permalink or data-product-url
|
||||
const dataRegex = /data-(?:permalink|product-url)="([^"]*\/product\/[^"]+)"/gi;
|
||||
while ((m = dataRegex.exec(html)) !== null) {
|
||||
const u = m[1].startsWith("http") ? m[1] : `${BASE}${m[1]}`;
|
||||
urls.add(normalizeProductUrl(u));
|
||||
}
|
||||
|
||||
// Filter out category pages — only keep individual product URLs
|
||||
return Array.from(urls).filter((u) => {
|
||||
const path = new URL(u).pathname;
|
||||
// Must be /product/something — not /products/ (that's a category)
|
||||
return path.startsWith("/product/") && path.split("/").filter(Boolean).length >= 2;
|
||||
});
|
||||
}
|
||||
|
||||
function normalizeProductUrl(url: string): string {
|
||||
// Ensure trailing slash, strip query and fragment
|
||||
try {
|
||||
const u = new URL(url);
|
||||
let path = u.pathname;
|
||||
if (!path.endsWith("/")) path += "/";
|
||||
return `${u.origin}${path}`;
|
||||
} catch {
|
||||
return url;
|
||||
while ((m = regex.exec(html)) !== null) {
|
||||
const u = m[1].replace(/\/$/, "") + "/";
|
||||
urls.add(u);
|
||||
}
|
||||
return Array.from(urls);
|
||||
}
|
||||
|
||||
interface ProductData {
|
||||
@ -122,7 +74,6 @@ interface ProductData {
|
||||
name: string;
|
||||
url: string;
|
||||
imageUrl?: string;
|
||||
datasheetUrl?: string;
|
||||
formFactor: string;
|
||||
speed: string;
|
||||
speedGbps: number;
|
||||
@ -137,51 +88,23 @@ async function scrapeProductPage(url: string): Promise<ProductData | null> {
|
||||
try {
|
||||
const html = await fetchPage(url);
|
||||
|
||||
// Product name — try OG tag first (most reliable), then H1
|
||||
const nameMatch =
|
||||
html.match(/property="og:title"\s+content="([^"]+)"/) ||
|
||||
html.match(/content="([^"]+)"\s+property="og:title"/) ||
|
||||
html.match(/<h1[^>]*class="[^"]*(?:product_title|entry-title)[^"]*"[^>]*>([^<]+)<\/h1>/i) ||
|
||||
html.match(/<h1[^>]*>([^<]+)<\/h1>/);
|
||||
const rawName = nameMatch?.[1]?.trim() ?? "";
|
||||
const name = rawName.replace(/\s*\|\s*Smartoptics\s*$/, "").replace(/\s*–\s*Smartoptics\s*$/, "").trim();
|
||||
if (!name || name.length < 4) return null;
|
||||
const nameMatch = html.match(/<h1[^>]*>([^<]+)<\/h1>/) || html.match(/og:title" content="([^"]+)"/);
|
||||
const name = nameMatch ? nameMatch[1].trim().replace(/ \| Smartoptics$/, "") : "";
|
||||
if (!name) return null;
|
||||
|
||||
// SKU — try WooCommerce SKU field first
|
||||
const skuMatch =
|
||||
html.match(/(?:SKU|Artikelnummer)[^<]*<\/[^>]+>\s*<[^>]+>([A-Z0-9][-A-Z0-9./]{2,40})/i) ||
|
||||
html.match(/"sku"\s*:\s*"([^"]+)"/) ||
|
||||
html.match(/class="sku"[^>]*>([^<]+)</) ||
|
||||
html.match(/data-sku="([^"]+)"/);
|
||||
const sku = skuMatch?.[1]?.trim().toUpperCase() ||
|
||||
url.split("/").filter(Boolean).pop()?.toUpperCase().replace(/-/g, "") ||
|
||||
name.slice(0, 30).toUpperCase().replace(/\s+/g, "-");
|
||||
const sku = url.split("/").filter(Boolean).pop()?.toUpperCase() || name.replace(/\s+/g, "-");
|
||||
|
||||
// Product image
|
||||
const imgMatch =
|
||||
html.match(/property="og:image"\s+content="([^"]+)"/) ||
|
||||
html.match(/content="([^"]+)"\s+property="og:image"/) ||
|
||||
html.match(/<img[^>]+src="([^"]*wp-content\/uploads[^"]*\.(?:png|jpg|webp))"[^>]+class="[^"]*(?:wp-post-image|attachment-shop_single)[^"]*"/i);
|
||||
const imageUrl = imgMatch?.[1];
|
||||
|
||||
// Datasheet PDF link
|
||||
const dsMatch = html.match(/href="([^"]*\.pdf)"[^>]*>.*?(?:datasheet|datenblatt|spec)/gi);
|
||||
const datasheetUrl = dsMatch
|
||||
? (dsMatch[0].match(/href="([^"]+)"/) ?? [])[1]
|
||||
: undefined;
|
||||
const imgMatch = html.match(/property="og:image" content="([^"]+)"/)
|
||||
|| html.match(/<img[^>]+src="([^"]*wp-content\/uploads[^"]*\.(?:png|jpg|webp))"[^>]* class="[^"]*product/i);
|
||||
const imageUrl = imgMatch ? imgMatch[1] : undefined;
|
||||
|
||||
const ff = detectFormFactor(name);
|
||||
const reach = detectReach(name);
|
||||
const pageText = html.slice(0, 5000); // only check first 5KB for coherent detection
|
||||
const coherent = /coherent|coh-t|coh\.|dp-qpsk|qpsk|cfp2/i.test(name + pageText);
|
||||
const coherent = /coherent|coh-t|coh\.|dwdm|dp-qpsk|qpsk|cfp2/i.test(name + html.slice(0, 3000));
|
||||
const wdmType = /dwdm/i.test(name) ? "DWDM" : /cwdm/i.test(name) ? "CWDM" : undefined;
|
||||
|
||||
return {
|
||||
sku,
|
||||
name,
|
||||
url,
|
||||
imageUrl,
|
||||
datasheetUrl,
|
||||
sku, name, url, imageUrl,
|
||||
...ff,
|
||||
reachLabel: reach?.label,
|
||||
reachMeters: reach?.meters,
|
||||
@ -190,40 +113,14 @@ async function scrapeProductPage(url: string): Promise<ProductData | null> {
|
||||
wdmType,
|
||||
};
|
||||
} catch (err) {
|
||||
console.warn(` Failed ${url}: ${(err as Error).message.slice(0, 80)}`);
|
||||
console.warn(` Failed ${url}: ${(err as Error).message}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/** Try WooCommerce REST API for a complete product list (often publicly accessible) */
|
||||
async function tryWooCommerceApi(): Promise<string[]> {
|
||||
const urls: string[] = [];
|
||||
try {
|
||||
for (let page = 1; page <= 20; page++) {
|
||||
const apiUrl = `${BASE}/wp-json/wc/v3/products?per_page=100&page=${page}&category=optical-transceivers&status=publish`;
|
||||
const resp = await fetch(apiUrl, {
|
||||
headers: { ...HEADERS, Accept: "application/json" },
|
||||
signal: AbortSignal.timeout(10000),
|
||||
});
|
||||
if (!resp.ok) break;
|
||||
const products = await resp.json() as Array<{ permalink?: string; slug?: string }>;
|
||||
if (!Array.isArray(products) || products.length === 0) break;
|
||||
for (const p of products) {
|
||||
if (p.permalink) urls.push(normalizeProductUrl(p.permalink));
|
||||
else if (p.slug) urls.push(normalizeProductUrl(`${BASE}/product/${p.slug}/`));
|
||||
}
|
||||
if (products.length < 100) break;
|
||||
await sleep(500);
|
||||
}
|
||||
} catch {
|
||||
// API not accessible — not unusual, fall through to HTML crawl
|
||||
}
|
||||
return urls;
|
||||
}
|
||||
|
||||
export async function scrapeSmartOptics(): Promise<void> {
|
||||
console.log("=== SmartOptics Scraper v2 Starting ===\n");
|
||||
console.log("Note: SmartOptics is B2B — no public prices. Scraping specs + catalog only.\n");
|
||||
console.log("=== SmartOptics Scraper Starting ===\n");
|
||||
console.log("Note: SmartOptics is B2B — no public prices. Scraping specs + images only.\n");
|
||||
|
||||
const vendorId = await ensureVendor(
|
||||
"SmartOptics",
|
||||
@ -233,54 +130,32 @@ export async function scrapeSmartOptics(): Promise<void> {
|
||||
);
|
||||
|
||||
const productUrls = new Set<string>();
|
||||
|
||||
// ── Try WooCommerce REST API first (fastest, most complete) ──────────────
|
||||
console.log("[1] Trying WooCommerce REST API…");
|
||||
const apiUrls = await tryWooCommerceApi();
|
||||
if (apiUrls.length > 0) {
|
||||
console.log(` API returned ${apiUrls.length} products`);
|
||||
apiUrls.forEach((u) => productUrls.add(u));
|
||||
} else {
|
||||
console.log(" API not accessible — falling back to HTML crawl");
|
||||
}
|
||||
|
||||
// ── HTML catalog crawl (always run to catch any API misses) ───────────────
|
||||
console.log("[2] Crawling category pages…");
|
||||
for (const catPath of CATALOG_PAGES) {
|
||||
const catBase = `${BASE}${catPath}`;
|
||||
for (let page = 1; page <= 10; page++) {
|
||||
const pageUrl = page === 1 ? catBase : `${catBase}page/${page}/`;
|
||||
try {
|
||||
const html = await fetchPage(pageUrl);
|
||||
const found = extractProductUrls(html, pageUrl);
|
||||
if (found.length === 0 && page > 1) break; // no more pages in this category
|
||||
if (found.length === 0 && page === 1) break; // category doesn't exist
|
||||
found.forEach((u) => productUrls.add(u));
|
||||
console.log(` ${catPath} p${page}: ${found.length} products`);
|
||||
await sleep(1200);
|
||||
} catch (err) {
|
||||
const msg = (err as Error).message;
|
||||
if (!msg.includes("404")) console.warn(` ${pageUrl}: ${msg.slice(0, 60)}`);
|
||||
const url = page === 1 ? CATALOG_URL : `${CATALOG_URL}page/${page}/`;
|
||||
const html = await fetchPage(url);
|
||||
const urls = extractProductUrls(html);
|
||||
if (urls.length === 0) break;
|
||||
urls.forEach((u) => productUrls.add(u));
|
||||
console.log(` Catalog page ${page}: ${urls.length} products`);
|
||||
await sleep(1500);
|
||||
} catch {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`\nTotal unique product URLs: ${productUrls.size}`);
|
||||
console.log(`\nTotal product URLs: ${productUrls.size}`);
|
||||
if (productUrls.size === 0) {
|
||||
console.warn("No products found — SmartOptics site structure may have changed");
|
||||
console.log("No products found — site may have changed structure");
|
||||
return;
|
||||
}
|
||||
|
||||
// ── Scrape individual product pages ───────────────────────────────────────
|
||||
console.log("\n[3] Scraping product detail pages…");
|
||||
let saved = 0;
|
||||
let withImages = 0;
|
||||
let failed = 0;
|
||||
|
||||
for (const url of productUrls) {
|
||||
const product = await scrapeProductPage(url);
|
||||
if (!product) { failed++; continue; }
|
||||
if (!product) continue;
|
||||
|
||||
try {
|
||||
await findOrCreateScrapedTransceiver({
|
||||
@ -298,18 +173,14 @@ export async function scrapeSmartOptics(): Promise<void> {
|
||||
});
|
||||
saved++;
|
||||
if (product.imageUrl) withImages++;
|
||||
console.log(` ✓ ${product.sku.slice(0, 25).padEnd(25)} ${product.name.slice(0, 50)}`);
|
||||
console.log(` ✓ ${product.sku} — ${product.name.slice(0, 60)}`);
|
||||
} catch (err) {
|
||||
console.warn(` ✗ ${product.sku}: ${(err as Error).message.slice(0, 80)}`);
|
||||
console.warn(` Error saving ${product.sku}: ${(err as Error).message.slice(0, 80)}`);
|
||||
}
|
||||
await sleep(1200);
|
||||
await sleep(1500);
|
||||
}
|
||||
|
||||
console.log(`\n=== SmartOptics v2 Complete ===`);
|
||||
console.log(` Products discovered: ${productUrls.size}`);
|
||||
console.log(` Saved to DB: ${saved}`);
|
||||
console.log(` With images: ${withImages}`);
|
||||
if (failed > 0) console.warn(` Failed pages: ${failed}`);
|
||||
console.log(`\n=== SmartOptics Complete: ${saved} products, ${withImages} with images ===`);
|
||||
}
|
||||
|
||||
if (require.main === module) {
|
||||
|
||||
@ -1,284 +0,0 @@
|
||||
/**
|
||||
* Vcelink Scraper — Chinese compatible transceiver vendor
|
||||
*
|
||||
* vcelink.com — Shopify-based store, USD pricing.
|
||||
* Covers SFP/SFP+/QSFP28/QSFP-DD, DAC/AOC cables.
|
||||
* Rate limited: 1 req/2sec.
|
||||
*
|
||||
* Vcelink offers Cisco/Juniper/Arista-compatible optics at competitive
|
||||
* USD prices, publicly accessible without login.
|
||||
*/
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
|
||||
const BASE = "https://www.vcelink.com";
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
};
|
||||
|
||||
const MAX_PAGES = 20;
|
||||
|
||||
const CATEGORIES = [
|
||||
{ path: "/collections/1g-sfp", formFactor: "SFP", speed: "1G", speedGbps: 1 },
|
||||
{ path: "/collections/10g-sfp", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
|
||||
{ path: "/collections/25g-sfp28", formFactor: "SFP28", speed: "25G", speedGbps: 25 },
|
||||
{ path: "/collections/40g-qsfp", formFactor: "QSFP+", speed: "40G", speedGbps: 40 },
|
||||
{ path: "/collections/100g-qsfp28",formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
|
||||
{ path: "/collections/400g-qsfp-dd",formFactor: "QSFP-DD",speed: "400G", speedGbps: 400 },
|
||||
{ path: "/collections/dac-cable", formFactor: "DAC", speed: "10G", speedGbps: 10 },
|
||||
{ path: "/collections/aoc-cable", formFactor: "AOC", speed: "10G", speedGbps: 10 },
|
||||
{ path: "/collections/all", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
|
||||
];
|
||||
|
||||
interface Product {
|
||||
partNumber: string;
|
||||
name: string;
|
||||
url: string;
|
||||
price?: number;
|
||||
formFactor: string;
|
||||
speed: string;
|
||||
speedGbps: number;
|
||||
reachLabel?: string;
|
||||
reachMeters?: number;
|
||||
fiberType?: string;
|
||||
wavelength?: string;
|
||||
}
|
||||
|
||||
function sleep(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
function detectReach(text: string): { label: string; meters: number } | undefined {
|
||||
const patterns: [RegExp, string, number][] = [
|
||||
[/\b120\s*km\b/i, "120km", 120000],
|
||||
[/\b80\s*km\b/i, "80km", 80000],
|
||||
[/\b40\s*km\b/i, "40km", 40000],
|
||||
[/\b20\s*km\b/i, "20km", 20000],
|
||||
[/\b10\s*km\b/i, "10km", 10000],
|
||||
[/\b2\s*km\b/i, "2km", 2000],
|
||||
[/\b550\s*m\b/i, "550m", 550],
|
||||
[/\b300\s*m\b/i, "300m", 300],
|
||||
[/\b100\s*m\b/i, "100m", 100],
|
||||
[/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000],
|
||||
[/\bER4?\b/, "40km", 40000], [/\bZR4?\b/, "80km", 80000],
|
||||
[/\bSR4?\b/, "300m", 300], [/\bDR4?\b/, "500m", 500],
|
||||
[/\bFR4?\b/, "2km", 2000],
|
||||
];
|
||||
for (const [re, label, meters] of patterns) {
|
||||
if (re.test(text)) return { label, meters };
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function detectFiber(text: string): string {
|
||||
if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) return "SMF";
|
||||
if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF";
|
||||
if (/copper|dac|twinax|rj.?45|base-t/i.test(text)) return "Copper";
|
||||
return "";
|
||||
}
|
||||
|
||||
function detectWavelength(text: string): string {
|
||||
const m = text.match(/(\d{3,4})\s*nm/i);
|
||||
return m ? m[1] : "";
|
||||
}
|
||||
|
||||
/**
 * Extract products from a collection page's HTML.
 *
 * Three strategies are tried in order; a later one runs only when the earlier
 * ones produced nothing:
 *   1. product-card markup (`product-item`, `product__card`, `grid__item`, `product-card`)
 *   2. embedded JSON-LD `"@type":"Product"` records
 *   3. any `/products/...` link with visible link text
 *
 * Only entries whose name or URL mentions a transceiver-like keyword are kept,
 * and each URL is emitted at most once per call. Form factor and speed come
 * from `cat`; reach, fiber type and wavelength are derived from the name.
 *
 * @param html Raw page HTML.
 * @param cat  Category the page belongs to.
 * @returns Parsed products (possibly empty).
 */
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
  const products: Product[] = [];
  const seen = new Set<string>();
  const collapsed = html.replace(/\s+/g, " ");
  const relevant = /sfp|qsfp|transceiver|dac|aoc|optic/i;

  // Shared Product assembly so all three strategies apply identical price
  // sanity bounds and spec detection.
  const addProduct = (partNumber: string, name: string, url: string, price: number | undefined): void => {
    const reach = detectReach(name);
    products.push({
      partNumber, name, url,
      // Missing, NaN, non-positive and implausibly large prices are dropped.
      price: price !== undefined && price > 0 && price < 100000 ? price : undefined,
      formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps,
      reachLabel: reach?.label, reachMeters: reach?.meters,
      fiberType: detectFiber(name), wavelength: detectWavelength(name),
    });
  };

  // Parse the first "$1,234.56"-style amount found in a fragment.
  const firstDollarPrice = (fragment: string): number | undefined => {
    const priceMatch = fragment.match(/\$\s*([\d,]+\.?\d*)/);
    return priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined;
  };

  // Strategy 1: Shopify product grid (class="product-item" or similar).
  // NOTE(review): the lazy match stops at the first closing </li>/</div>, so
  // cards containing nested <div>s are presumably truncated — confirm against live markup.
  for (const m of collapsed.matchAll(/<(?:li|div)[^>]+class="[^"]*(?:product-item|product__card|grid__item|product-card)[^"]*"[^>]*>([\s\S]*?)<\/(?:li|div)>/gi)) {
    const card = m[1];
    const urlMatch = card.match(/href="(\/products\/[^"?#]+)"/i) ||
      card.match(/href="(https?:\/\/(?:www\.)?vcelink\.com\/products\/[^"?#]+)"/i);
    if (!urlMatch) continue;
    const rawUrl = urlMatch[1];
    const url = rawUrl.startsWith("http") ? rawUrl : BASE + rawUrl;
    if (seen.has(url)) continue;
    seen.add(url);

    const nameMatch = card.match(/<h[23456][^>]*>([^<]{8,})<\/h[23456]>/i) ||
      card.match(/class="[^"]*(?:title|name)[^"]*"[^>]*>([^<]{8,})</i) ||
      card.match(/aria-label="([^"]{8,})"/i);
    if (!nameMatch) continue;
    // Decode "&amp;" and drop numeric character references.
    const name = nameMatch[1].trim().replace(/&amp;/g, "&").replace(/&#\d+;/g, "");
    if (name.length < 5 || !relevant.test(name + url)) continue;

    // Part number: everything before "compatible" / "for <Brand>", capped at 80 chars.
    const partNumber = name.split(/\s+(?:compatible|for\s+[A-Z])/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);

    addProduct(partNumber, name, url, firstDollarPrice(card));
  }

  // Strategy 2: JSON-LD product data (Shopify embeds this)
  if (products.length === 0) {
    for (const m of collapsed.matchAll(/"@type"\s*:\s*"Product"[^}]*"name"\s*:\s*"([^"]{8,})"[^}]*"url"\s*:\s*"([^"]+)"[^}]*"price"\s*:\s*"?([\d.]+)/gi)) {
      const name = m[1].replace(/\\n/g, " ").trim();
      const url = m[2].startsWith("http") ? m[2] : BASE + m[2];
      if (seen.has(url)) continue;
      if (!relevant.test(name + url)) continue;
      seen.add(url);

      addProduct(name.split(/\s+/)[0]?.slice(0, 80) || "", name, url, parseFloat(m[3]));
    }
  }

  // Strategy 3: Generic product links
  if (products.length === 0) {
    for (const m of collapsed.matchAll(/href="(\/products\/[^"?#]{5,})"[^>]*>([^<]{8,})</gi)) {
      const url = BASE + m[1];
      const name = m[2].trim().replace(/&amp;/g, "&");
      if (seen.has(url) || name.length < 8) continue;
      if (!relevant.test(name + url)) continue;
      seen.add(url);

      // Search for a price around THIS link; m.index is the position of the
      // current match, whereas indexOf() would return the first occurrence
      // of the path anywhere in the page.
      const idx = m.index ?? collapsed.indexOf(m[1]);
      const ctx = collapsed.slice(Math.max(0, idx - 200), idx + 400);

      addProduct(name.split(/\s+/)[0]?.slice(0, 80) || "", name, url, firstDollarPrice(ctx));
    }
  }

  return products;
}
|
||||
|
||||
/**
 * Download a page as text using the scraper's shared request headers.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body as text.
 * @throws Error `HTTP <status> for <url>` when the response is not OK; the
 *         request is also aborted by a 30000 ms timeout signal.
 */
async function fetchPage(url: string): Promise<string> {
  const timeoutSignal = AbortSignal.timeout(30000);
  const response = await fetch(url, { headers: HEADERS, signal: timeoutSignal });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
|
||||
|
||||
/**
 * Scrape every configured Vcelink category and persist the results.
 *
 * For each entry in CATEGORIES: fetch page 1, follow `?page=N` pagination up
 * to MAX_PAGES (stopping at the first empty or failed page), de-duplicate by
 * URL, then upsert each product and — when a price was found — a USD price
 * observation. Failures are logged per page, per product and per category and
 * never abort the whole run.
 *
 * The generic "/collections/all" fallback is skipped, without issuing a
 * request, once more than three categories have yielded products.
 *
 * @returns Resolves when all categories have been processed.
 */
export async function scrapeVcelink(): Promise<void> {
  console.log("=== Vcelink Scraper Starting ===\n");

  const vendorId = await ensureVendor(
    "Vcelink",
    "compatible",
    "https://www.vcelink.com",
    "https://www.vcelink.com/collections/all",
  );

  let totalProducts = 0;
  let priceUpdates = 0;
  const seenCategories = new Set<string>();

  for (const cat of CATEGORIES) {
    console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`);

    // Decide before fetching: the fallback page is not needed once enough
    // specific categories have been scraped, so do not spend a request on it.
    if (cat.path.includes("/collections/all") && seenCategories.size > 3) {
      console.log(` Skipping generic fallback (${seenCategories.size} categories scraped)`);
      continue;
    }

    try {
      const html1 = await fetchPage(BASE + cat.path);
      const catProducts = parseProductList(html1, cat);

      if (catProducts.length === 0) {
        console.log(" No products on page 1 — skipping");
        continue;
      }

      seenCategories.add(cat.path);
      console.log(` Found ${catProducts.length} products on page 1`);

      const allProducts = [...catProducts];

      for (let page = 2; page <= MAX_PAGES; page++) {
        await sleep(2000);
        try {
          // Shopify pagination: ?page=N
          const pageUrl = `${BASE}${cat.path}?page=${page}`;
          const html = await fetchPage(pageUrl);
          const pageProds = parseProductList(html, cat);
          if (pageProds.length === 0) break;
          allProducts.push(...pageProds);
          console.log(` Page ${page}: ${pageProds.length} products`);
        } catch (err) {
          console.warn(` Page ${page} failed: ${(err as Error).message.slice(0, 60)}`);
          break;
        }
      }

      // Keep the first occurrence of each URL; Map preserves insertion order.
      const byUrl = new Map<string, Product>();
      for (const candidate of allProducts) {
        if (!byUrl.has(candidate.url)) byUrl.set(candidate.url, candidate);
      }
      const uniqueProducts = [...byUrl.values()];
      console.log(` Total unique: ${uniqueProducts.length}`);

      for (const product of uniqueProducts) {
        try {
          const txId = await findOrCreateScrapedTransceiver({
            partNumber: product.partNumber,
            vendorId,
            formFactor: product.formFactor,
            speedGbps: product.speedGbps,
            speed: product.speed,
            reachMeters: product.reachMeters,
            reachLabel: product.reachLabel,
            fiberType: product.fiberType,
            wavelengths: product.wavelength,
            category: "DataCenter",
          });

          if (product.price && product.price > 0) {
            const hash = contentHash({ price: product.price, part: product.partNumber });
            const updated = await upsertPriceObservation({
              transceiverId: txId,
              sourceVendorId: vendorId,
              price: product.price,
              currency: "USD",
              stockLevel: "in_stock",
              url: product.url,
              contentHash: hash,
            });
            if (updated) priceUpdates++;
          }
          totalProducts++;
        } catch (err) {
          console.warn(` DB error: ${(err as Error).message.slice(0, 80)}`);
        }
      }
    } catch (err) {
      console.error(` Category failed: ${(err as Error).message}`);
    } finally {
      // Rate limit after every category that issued a request — including the
      // early `continue` for an empty page 1, which would otherwise skip the delay.
      await sleep(2000);
    }
  }

  console.log(`\n=== Vcelink Complete: ${totalProducts} products, ${priceUpdates} price updates ===`);
}
|
||||
|
||||
// CLI entry point: run the scraper only when this file is executed directly.
if (require.main === module) {
  scrapeVcelink()
    .then(() => pool.end())
    .catch(async (err) => {
      console.error("Fatal:", err);
      // Wait for the pool to close before exiting; calling process.exit()
      // right after an un-awaited pool.end() cuts the shutdown short.
      try {
        await pool.end();
      } catch {
        // Already on the failure path — a second error while closing is not actionable.
      }
      process.exit(1);
    });
}
|
||||
@ -18,28 +18,6 @@ export const pool = new Pool({
|
||||
// Alias — some scrapers import { db } instead of { pool }
|
||||
export const db = pool;
|
||||
|
||||
/**
 * Promote a transceiver to `fully_verified` once all four verification flags
 * (price, image, details, competitor) are true.
 *
 * Intended to be called after any of those flags is written so the
 * fully-verified state stays consistent. Rows that are already fully verified
 * are not touched, and an existing `fully_verified_at` is preserved.
 *
 * @param transceiverId Primary key of the transceiver to check.
 * @returns `true` when this call performed the promotion, otherwise `false`.
 */
export async function checkAndSetFullyVerified(transceiverId: string): Promise<boolean> {
  const promoteSql = `UPDATE transceivers
     SET fully_verified = true,
         fully_verified_at = COALESCE(fully_verified_at, NOW())
     WHERE id = $1
       AND price_verified = true
       AND image_verified = true
       AND details_verified = true
       AND competitor_verified = true
       AND (fully_verified IS NULL OR fully_verified = false)
     RETURNING id`;

  const { rowCount } = await pool.query(promoteSql, [transceiverId]);
  const promoted = (rowCount ?? 0) > 0;
  return promoted;
}
|
||||
|
||||
// Per-form-factor price bounds [min, max] in USD equivalent
|
||||
const PRICE_BOUNDS: Record<string, [number, number]> = {
|
||||
"SFP": [2, 3000],
|
||||
@ -89,12 +67,6 @@ export async function upsertPriceObservation(params: {
|
||||
: params.currency === "GBP" ? params.price * 1.27
|
||||
: params.price;
|
||||
|
||||
// Hard floor: no transceiver of any type can cost less than $1.50 — catches accessories/cables
|
||||
// misidentified as transceivers (e.g. FS-XXXXX DAC cables scraped as OSFP/QSFP28)
|
||||
if (priceUsd < 1.5) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const anomalous = await isPriceAnomalous(params.transceiverId, priceUsd);
|
||||
if (anomalous) {
|
||||
return false; // Reject price outside form-factor bounds
|
||||
@ -124,7 +96,6 @@ export async function upsertPriceObservation(params: {
|
||||
WHERE id = $1 AND (price_verified IS NULL OR price_verified = false OR ${isCompetitor ? "competitor_verified IS NULL OR competitor_verified = false" : "false"})`,
|
||||
[params.transceiverId]
|
||||
);
|
||||
await checkAndSetFullyVerified(params.transceiverId);
|
||||
return false; // No change
|
||||
}
|
||||
|
||||
@ -160,121 +131,9 @@ export async function upsertPriceObservation(params: {
|
||||
[params.transceiverId]
|
||||
);
|
||||
}
|
||||
await checkAndSetFullyVerified(params.transceiverId);
|
||||
return true; // New observation written
|
||||
}
|
||||
|
||||
/**
 * Record a stock observation with full warehouse breakdown (FS.com v2+).
 * Writes to stock_observations including DE warehouse, global warehouse,
 * backorder, units_sold, compatible_brands, price_net, product_url, plus
 * quality metadata: stock_confidence, price_currency, price_includes_tax,
 * stock_vendor_ts.
 *
 * stock_confidence:
 *   1 = boolean only (in_stock true/false, no unit count)
 *   2 = aggregated global count (single number, e.g. QSFPTEK "5507 in real-time stock")
 *   3 = per-warehouse breakdown (e.g. FS.com DE + Global split)
 *
 * Nothing is written when no quantity of any kind is supplied, or when the
 * five tracked counts (DE, global, backorder, units sold, quantity available)
 * equal the most recent observation for the same transceiver and vendor.
 *
 * `stockLevel` is accepted for interface compatibility but not stored; the
 * `in_stock` column is derived from the supplied quantities.
 *
 * @returns true only when a new row was inserted.
 */
export async function upsertStockObservation(params: {
  transceiverId: string;
  sourceVendorId: string;
  stockLevel: string;
  quantityAvailable?: number;
  warehouseDeQty?: number;
  warehouseDeDeliveryDate?: string | null;
  warehouseGlobalQty?: number;
  warehouseGlobalDeliveryDate?: string | null;
  backorderQty?: number;
  backorderEstimatedDate?: string | null;
  unitsSold?: number;
  compatibleBrands?: string[];
  priceNet?: number;
  productUrl?: string;
  // Quality metadata (migration 038)
  stockConfidence?: 1 | 2 | 3;
  priceCurrency?: string;
  priceIncludesTax?: boolean;
  stockVendorTs?: Date | null;
}): Promise<boolean> {
  // Skip if there is genuinely no warehouse data at all
  // (includes backorderQty so products available only on backorder are recorded)
  if (
    params.warehouseDeQty === undefined &&
    params.warehouseGlobalQty === undefined &&
    params.quantityAvailable === undefined &&
    params.backorderQty === undefined
  ) {
    return false;
  }

  // Compare against the last observation to avoid duplicate writes
  const lastObs = await pool.query(
    `SELECT warehouse_de_qty, warehouse_global_qty, backorder_qty, units_sold, quantity_available
     FROM stock_observations
     WHERE transceiver_id = $1 AND source_vendor_id = $2
     ORDER BY time DESC LIMIT 1`,
    [params.transceiverId, params.sourceVendorId]
  );

  if (lastObs.rows.length > 0) {
    const r = lastObs.rows[0];

    // Normalise the stored value to a number before comparing: units_sold is
    // BIGINT, which node-postgres returns as a string by default, so a strict
    // `===` against the numeric parameter would never match and every run
    // would insert a duplicate row.
    const sameCount = (stored: unknown, incoming: number | undefined): boolean => {
      const previous = stored === null || stored === undefined ? null : Number(stored);
      return previous === (incoming ?? null);
    };

    const unchanged =
      sameCount(r.warehouse_de_qty, params.warehouseDeQty) &&
      sameCount(r.warehouse_global_qty, params.warehouseGlobalQty) &&
      sameCount(r.backorder_qty, params.backorderQty) &&
      sameCount(r.units_sold, params.unitsSold) &&
      sameCount(r.quantity_available, params.quantityAvailable);
    if (unchanged) return false;
  }

  // Backorder quantity deliberately does not count as "in stock".
  const inStock =
    ((params.warehouseDeQty ?? 0) + (params.warehouseGlobalQty ?? 0) + (params.quantityAvailable ?? 0)) > 0;

  await pool.query(
    `INSERT INTO stock_observations (
       time, transceiver_id, source_vendor_id,
       in_stock, quantity_available,
       warehouse_de_qty, warehouse_de_delivery_date,
       warehouse_global_qty, warehouse_global_delivery_date,
       backorder_qty, backorder_estimated_date,
       units_sold, compatible_brands, price_net, product_url,
       stock_confidence, price_currency, price_includes_tax, stock_vendor_ts
     ) VALUES (
       NOW(), $1, $2,
       $3, $4,
       $5, $6::date,
       $7, $8::date,
       $9, $10::date,
       $11, $12, $13, $14,
       $15, $16, $17, $18
     )`,
    [
      params.transceiverId,
      params.sourceVendorId,
      inStock,
      params.quantityAvailable ?? null,
      params.warehouseDeQty ?? null,
      params.warehouseDeDeliveryDate ?? null,
      params.warehouseGlobalQty ?? null,
      params.warehouseGlobalDeliveryDate ?? null,
      params.backorderQty ?? null,
      params.backorderEstimatedDate ?? null,
      params.unitsSold ?? null,
      // An empty brand list is stored as NULL rather than an empty array.
      params.compatibleBrands?.length ? params.compatibleBrands : null,
      params.priceNet ?? null,
      params.productUrl ?? null,
      params.stockConfidence ?? 1,
      params.priceCurrency ?? null,
      params.priceIncludesTax ?? null,
      params.stockVendorTs ?? null,
    ]
  );

  return true;
}
|
||||
|
||||
export async function findOrCreateScrapedTransceiver(params: {
|
||||
partNumber: string;
|
||||
vendorId: string;
|
||||
@ -301,7 +160,6 @@ export async function findOrCreateScrapedTransceiver(params: {
|
||||
`UPDATE transceivers SET image_url = $1, image_verified = true, updated_at = NOW() WHERE id = $2`,
|
||||
[params.imageUrl, existing.rows[0].id]
|
||||
);
|
||||
await checkAndSetFullyVerified(existing.rows[0].id);
|
||||
}
|
||||
return existing.rows[0].id;
|
||||
}
|
||||
|
||||
@ -1,66 +0,0 @@
|
||||
#!/bin/bash
# FS.com Scraper — Mac-side runner
# Runs from this Mac (residential IP) so FS.com isn't blocked.
# Opens SSH tunnel to Erik's DB → runs scraper → closes tunnel.
#
# Schedule: launchd at 02:00, 10:00, 18:00 daily
# Log: ~/Library/Logs/tip-fs-scraper.log

set -euo pipefail

LOG="$HOME/Library/Logs/tip-fs-scraper.log"
REPO="/Users/renefichtmueller/Desktop/Claude Code/github-repos/transceiver-db"
NODE="/opt/homebrew/bin/node"
NPX="/opt/homebrew/bin/npx"
TUNNEL_PID_FILE="/tmp/tip-db-tunnel.pid"
DB_LOCAL_PORT=5433
# Regex (pgrep/pkill -f) matching the tunnel process started by open_tunnel,
# with or without extra ssh options between "-f" and "-L".
TUNNEL_PATTERN="ssh -N -f .*-L ${DB_LOCAL_PORT}:localhost:${DB_LOCAL_PORT}"

# Timestamped log line to stdout and the log file.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOG"; }

# ── Open SSH tunnel if not already running ────────────────────────────────────
open_tunnel() {
  if [ -f "$TUNNEL_PID_FILE" ]; then
    local pid
    pid=$(cat "$TUNNEL_PID_FILE")
    if kill -0 "$pid" 2>/dev/null; then
      log "Tunnel already running (PID $pid)"
      return 0
    fi
  fi
  log "Opening SSH tunnel → Erik PostgreSQL on port $DB_LOCAL_PORT…"
  # ExitOnForwardFailure makes ssh exit non-zero (aborting the script via
  # `set -e`) when the port forward cannot be established, instead of leaving
  # a connected but useless session in the background.
  ssh -N -f -o ExitOnForwardFailure=yes -L "${DB_LOCAL_PORT}:localhost:${DB_LOCAL_PORT}" erik
  # -f forks to the background, so look the PID up afterwards and record it;
  # the PID-file check above only works if the file is actually written.
  pgrep -n -f "$TUNNEL_PATTERN" > "$TUNNEL_PID_FILE" || rm -f "$TUNNEL_PID_FILE"
  log "Tunnel opened"
  sleep 2  # Give the tunnel a moment to establish
}

# Stop the tunnel: by recorded PID when available, then by pattern as a fallback.
close_tunnel() {
  log "Closing SSH tunnel…"
  if [ -f "$TUNNEL_PID_FILE" ]; then
    kill "$(cat "$TUNNEL_PID_FILE")" 2>/dev/null || true
  fi
  pkill -f "$TUNNEL_PATTERN" 2>/dev/null || true
  rm -f "$TUNNEL_PID_FILE"
}

# ── Main ──────────────────────────────────────────────────────────────────────
mkdir -p "$(dirname "$LOG")"
log "=== FS.com Mac Scraper starting ==="

# Only close tunnel if we opened it (not if one was already running)
OPENED_TUNNEL=0
if ! pgrep -f "ssh -N.*${DB_LOCAL_PORT}:localhost" >/dev/null 2>&1; then
  open_tunnel
  OPENED_TUNNEL=1
  trap close_tunnel EXIT
fi

cd "$REPO"

export POSTGRES_HOST=localhost
export POSTGRES_PORT=$DB_LOCAL_PORT
export POSTGRES_DB=transceiver_db
export POSTGRES_USER=tip
# SECURITY: a production password is committed in this script. An existing
# POSTGRES_PASSWORD in the environment now takes precedence; the literal
# default is kept only so current launchd jobs keep working and should be
# moved to a secret store (e.g. ~/.pgpass or the macOS keychain) and rotated.
export POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-tip_prod_2026}"
export NODE_ENV=production

log "Running fs-com scraper via tsx…"
# With pipefail, a scraper failure fails the pipeline and aborts the script.
"$NPX" tsx packages/scraper/src/scrapers/fs-com.ts 2>&1 | tee -a "$LOG"

log "=== FS.com Mac Scraper complete ==="
|
||||
@ -1,17 +0,0 @@
|
||||
-- Migration 028: Extend stock_observations with full warehouse breakdown
-- Adds the FS.com warehouse columns (DE warehouse, global warehouse, backorder)
-- plus units_sold, compatible_brands, price_net and product_url.
-- NOTE: This migration was applied directly on Erik before being committed to the repo.
-- Every ADD COLUMN carries an IF NOT EXISTS guard, so re-applying it is safe.

ALTER TABLE stock_observations
    -- DE warehouse
    ADD COLUMN IF NOT EXISTS warehouse_de_qty               INTEGER,
    ADD COLUMN IF NOT EXISTS warehouse_de_delivery_date     DATE,
    -- Global warehouse
    ADD COLUMN IF NOT EXISTS warehouse_global_qty           INTEGER,
    ADD COLUMN IF NOT EXISTS warehouse_global_delivery_date DATE,
    -- Backorder
    ADD COLUMN IF NOT EXISTS backorder_qty                  INTEGER,
    ADD COLUMN IF NOT EXISTS backorder_estimated_date       DATE,
    -- Product metadata captured alongside the stock figures
    ADD COLUMN IF NOT EXISTS units_sold                     BIGINT,
    ADD COLUMN IF NOT EXISTS compatible_brands              TEXT[],
    ADD COLUMN IF NOT EXISTS price_net                      NUMERIC(12,4),
    ADD COLUMN IF NOT EXISTS product_url                    TEXT;
|
||||
@ -1,7 +0,0 @@
|
||||
-- Migration 034: Add review_tag column to blog_drafts for manual review tracking
-- The dashboard uses it so an editor can mark a post as reviewed before publishing.

-- Nullable with no explicit default, i.e. new and existing rows hold NULL.
ALTER TABLE blog_drafts
    ADD COLUMN IF NOT EXISTS review_tag VARCHAR(32);

COMMENT ON COLUMN blog_drafts.review_tag
    IS 'Manual review status tag — set to ''reviewed'' when editor has proofread the post, NULL otherwise';
|
||||
@ -1,12 +0,0 @@
|
||||
-- Migration 035: Add is_anomalous column to price_observations
-- Flags price entries that are outliers and should be excluded from display.

ALTER TABLE price_observations
    ADD COLUMN IF NOT EXISTS is_anomalous BOOLEAN NOT NULL DEFAULT FALSE;

-- Partial index covering only the non-anomalous rows.
CREATE INDEX IF NOT EXISTS idx_price_obs_anomalous
    ON price_observations (transceiver_id, is_anomalous)
    WHERE is_anomalous = FALSE;

COMMENT ON COLUMN price_observations.is_anomalous
    IS 'True when this price is flagged as an outlier/anomaly and should be excluded from price displays and comparisons';
|
||||
@ -1,44 +0,0 @@
|
||||
-- Migration 036: Transceiver equivalences for competitor_verified matching
-- Holds semantic equivalences between Flexoptix SKUs and competitor products,
-- matched on technical specs (form_factor + speed + reach + standard + fiber_type).

CREATE TABLE IF NOT EXISTS transceiver_equivalences (
    id            UUID         DEFAULT gen_random_uuid() PRIMARY KEY,
    -- Both sides reference transceivers; deleting either side removes the pairing.
    flexoptix_id  UUID         NOT NULL REFERENCES transceivers(id) ON DELETE CASCADE,
    competitor_id UUID         NOT NULL REFERENCES transceivers(id) ON DELETE CASCADE,
    -- Match confidence in the closed range [0, 1].
    confidence    DECIMAL(4,3) NOT NULL CHECK (confidence BETWEEN 0 AND 1),
    -- e.g. ['standard_name','form_factor','speed_gbps','fiber_type','reach']
    match_basis   TEXT[]       NOT NULL DEFAULT '{}',
    match_notes   TEXT,
    status        VARCHAR(20)  NOT NULL DEFAULT 'pending'
                  CHECK (status IN ('pending','approved','rejected','auto_approved')),
    reviewed_by   VARCHAR(200),
    reviewed_at   TIMESTAMPTZ,
    reject_reason TEXT,
    created_at    TIMESTAMPTZ  DEFAULT NOW(),
    updated_at    TIMESTAMPTZ  DEFAULT NOW(),
    -- At most one equivalence row per (Flexoptix, competitor) pair.
    UNIQUE (flexoptix_id, competitor_id)
);

CREATE INDEX IF NOT EXISTS idx_eq_flexoptix
    ON transceiver_equivalences (flexoptix_id);
CREATE INDEX IF NOT EXISTS idx_eq_competitor
    ON transceiver_equivalences (competitor_id);
CREATE INDEX IF NOT EXISTS idx_eq_status
    ON transceiver_equivalences (status, confidence DESC);
CREATE INDEX IF NOT EXISTS idx_eq_pending
    ON transceiver_equivalences (flexoptix_id)
    WHERE status = 'pending';

-- Keep updated_at current on every row update.
CREATE OR REPLACE FUNCTION update_equivalences_updated_at()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    NEW.updated_at := NOW();
    RETURN NEW;
END;
$$;

-- Drop and recreate so the migration can be re-applied.
DROP TRIGGER IF EXISTS trg_eq_updated_at ON transceiver_equivalences;
CREATE TRIGGER trg_eq_updated_at
    BEFORE UPDATE ON transceiver_equivalences
    FOR EACH ROW
    EXECUTE FUNCTION update_equivalences_updated_at();

COMMENT ON TABLE transceiver_equivalences IS
    'Semantic equivalences between Flexoptix SKUs and competitor products, '
    'matched by technical specification overlap. Used to set competitor_verified=true '
    'on Flexoptix transceivers that have no exact SKU match at competitors.';
|
||||
@ -1,12 +0,0 @@
|
||||
-- Migration 037: Add composite indexes for stock_observations
-- Supports the DISTINCT ON (transceiver_id, source_vendor_id) queries behind
-- GET /api/stock and GET /api/stock/summary.

-- "Latest observation per (transceiver, vendor)" lookups.
CREATE INDEX IF NOT EXISTS idx_stock_tx_vendor_time
    ON stock_observations (transceiver_id, source_vendor_id, time DESC);

-- "In-stock only" filter (used by /api/stock?in_stock=true); partial index
-- restricted to rows where in_stock is true.
CREATE INDEX IF NOT EXISTS idx_stock_in_stock
    ON stock_observations (in_stock, time DESC)
    WHERE in_stock = TRUE;
|
||||
@ -1,22 +0,0 @@
|
||||
-- Migration 038: Add data-quality columns to stock_observations + selective cleanup
--
-- stock_confidence:   1 = boolean only (in_stock true/false)
--                     2 = aggregated global count (single number, e.g. QSFPTEK)
--                     3 = per-warehouse breakdown (e.g. FS.com DE/Global split)
-- price_currency:     ISO 4217 code, e.g. 'USD', 'EUR', 'GBP'
-- price_includes_tax: true = gross price, false = net/excl. VAT
-- stock_vendor_ts:    timestamp as reported by vendor (e.g. QSFPTEK "17 Apr 2026")

-- ── Selective cleanup ───────────────────────────────────────────────────────
-- Truncate stock_observations: all 186 rows are from the first FS.com test run.
-- They will be repopulated automatically at the next scheduled scraper run
-- (02:00 / 10:00 / 18:00 via launchd). Transceiver catalog, specs, vendors,
-- price_observations, and all other tables are left untouched.
--
-- The cleanup runs BEFORE the columns are added and only while
-- stock_confidence does not exist yet. That makes it a one-time step:
-- re-applying this migration (which the IF NOT EXISTS guards below otherwise
-- allow) no longer wipes observations collected since the first run.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM information_schema.columns
        WHERE table_schema = current_schema()
          AND table_name = 'stock_observations'
          AND column_name = 'stock_confidence'
    ) THEN
        TRUNCATE TABLE stock_observations;
    END IF;
END
$$;

ALTER TABLE stock_observations
    ADD COLUMN IF NOT EXISTS stock_confidence   SMALLINT DEFAULT 1
        CHECK (stock_confidence IN (1, 2, 3)),
    ADD COLUMN IF NOT EXISTS price_currency     CHAR(3),
    ADD COLUMN IF NOT EXISTS price_includes_tax BOOLEAN,
    ADD COLUMN IF NOT EXISTS stock_vendor_ts    TIMESTAMPTZ;
|
||||
Loading…
x
Reference in New Issue
Block a user