fix: add fscom url discovery mode

This commit is contained in:
Rene Fichtmueller 2026-05-09 14:00:30 +02:00
parent 7ddc439315
commit 3d79f6b8e0

View File

@ -58,6 +58,7 @@ import {
upsertPriceObservation, upsertPriceObservation,
upsertStockObservation, upsertStockObservation,
findOrCreateScrapedTransceiver, findOrCreateScrapedTransceiver,
markImageVerified,
pool, pool,
} from "../utils/db"; } from "../utils/db";
import { contentHash } from "../utils/hash"; import { contentHash } from "../utils/hash";
@ -73,6 +74,7 @@ const STOCK_FRESH_HOURS = parseInt(process.env["FS_STOCK_FRESH_HOURS"] ?? "12",
const FORCE_REVALIDATE = process.env["TIP_FORCE_REVALIDATE"] === "1"; const FORCE_REVALIDATE = process.env["TIP_FORCE_REVALIDATE"] === "1";
const ONLY_MISSING_IMAGES = process.env["FS_ONLY_MISSING_IMAGES"] === "1"; const ONLY_MISSING_IMAGES = process.env["FS_ONLY_MISSING_IMAGES"] === "1";
const DB_DETAIL_ONLY = process.env["FS_DB_DETAIL_ONLY"] === "1"; const DB_DETAIL_ONLY = process.env["FS_DB_DETAIL_ONLY"] === "1";
const URL_DISCOVERY_ONLY = process.env["FS_URL_DISCOVERY_ONLY"] === "1";
const PROXY_URLS = (process.env["PROXY_URLS"] ?? "") const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
.split(",") .split(",")
@ -262,6 +264,7 @@ interface ProductSummary {
url: string; url: string;
name: string; name: string;
partNumber: string; partNumber: string;
targetTransceiverId?: string;
} }
interface ProductDetail extends ProductSummary { interface ProductDetail extends ProductSummary {
@ -394,7 +397,7 @@ async function collectProductUrls(
// ── Phase 2: Scrape product detail pages ────────────────────────────────────── // ── Phase 2: Scrape product detail pages ──────────────────────────────────────
async function scrapeProductDetails( async function scrapeProductDetails(
requests: Array<{ url: string; userData: { name: string; partNumber: string } }>, requests: Array<{ url: string; userData: { name: string; partNumber: string; targetTransceiverId?: string } }>,
proxyConfiguration: ProxyConfiguration | undefined proxyConfiguration: ProxyConfiguration | undefined
): Promise<ProductDetail[]> { ): Promise<ProductDetail[]> {
// Purge Phase 2 storage so it starts with a clean request queue // Purge Phase 2 storage so it starts with a clean request queue
@ -425,7 +428,9 @@ async function scrapeProductDetails(
const { name: listingName, partNumber: listingPn } = request.userData as { const { name: listingName, partNumber: listingPn } = request.userData as {
name: string; name: string;
partNumber: string; partNumber: string;
targetTransceiverId?: string;
}; };
const { targetTransceiverId } = request.userData as { targetTransceiverId?: string };
const url = request.url; const url = request.url;
try { try {
@ -732,6 +737,7 @@ async function scrapeProductDetails(
specs: raw.specs, specs: raw.specs,
imageUrl: resolveUrl(raw.imageUrl), imageUrl: resolveUrl(raw.imageUrl),
datasheetUrl: resolveUrl(raw.datasheetUrl), datasheetUrl: resolveUrl(raw.datasheetUrl),
targetTransceiverId,
}); });
}, },
}, makeCrawleeConfig("fs-phase2")); }, makeCrawleeConfig("fs-phase2"));
@ -789,11 +795,34 @@ export async function scrapeFs(): Promise<void> {
// ── Phase 1: Discover product URLs ───────────────────────────────────────── // ── Phase 1: Discover product URLs ─────────────────────────────────────────
let productMap: Map<string, ProductSummary>; let productMap: Map<string, ProductSummary>;
if (DB_DETAIL_ONLY) { if (URL_DISCOVERY_ONLY) {
console.log("\n[Phase 1] URL discovery mode — probing FS.COM rows without product URLs…");
const dbRows = await pool.query(
`
SELECT t.id, t.part_number
FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id
WHERE v.name = 'FS.COM'
AND COALESCE(t.product_page_url, '') = ''
AND t.part_number ~ '^FS-[0-9]+$'
ORDER BY t.part_number
LIMIT $1
`,
[MAX_DETAIL_PAGES_PER_RUN]
);
productMap = new Map(
dbRows.rows.map((row) => {
const partNumber = row.part_number as string;
const productId = partNumber.replace(/^FS-/, "");
const url = `${BASE_URL}/products/${productId}.html`;
return [url, { url, name: partNumber, partNumber, targetTransceiverId: row.id as string }];
})
);
} else if (DB_DETAIL_ONLY) {
console.log("\n[Phase 1] DB detail-only mode — using existing FS.COM product URLs with missing verification signals…"); console.log("\n[Phase 1] DB detail-only mode — using existing FS.COM product URLs with missing verification signals…");
const dbRows = await pool.query( const dbRows = await pool.query(
` `
SELECT t.part_number, t.product_page_url SELECT t.id, t.part_number, t.product_page_url
FROM transceivers t FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id JOIN vendors v ON v.id = t.vendor_id
WHERE v.name = 'FS.COM' WHERE v.name = 'FS.COM'
@ -820,7 +849,7 @@ export async function scrapeFs(): Promise<void> {
dbRows.rows.map((row) => { dbRows.rows.map((row) => {
const url = normalizeFsProductUrl(row.product_page_url as string); const url = normalizeFsProductUrl(row.product_page_url as string);
const partNumber = row.part_number as string; const partNumber = row.part_number as string;
return [url, { url, name: partNumber, partNumber }]; return [url, { url, name: partNumber, partNumber, targetTransceiverId: row.id as string }];
}) })
); );
} else { } else {
@ -889,7 +918,14 @@ export async function scrapeFs(): Promise<void> {
// ── Phase 2: Scrape detail pages ──────────────────────────────────────────── // ── Phase 2: Scrape detail pages ────────────────────────────────────────────
const detailRequests = urlsToScrape.map((url) => { const detailRequests = urlsToScrape.map((url) => {
const s = productMap.get(url); const s = productMap.get(url);
return { url, userData: { name: s?.name ?? "FS.com Product", partNumber: s?.partNumber ?? "" } }; return {
url,
userData: {
name: s?.name ?? "FS.com Product",
partNumber: s?.partNumber ?? "",
targetTransceiverId: s?.targetTransceiverId,
},
};
}); });
const details = await scrapeProductDetails(detailRequests, proxyConfiguration); const details = await scrapeProductDetails(detailRequests, proxyConfiguration);
@ -911,20 +947,50 @@ export async function scrapeFs(): Promise<void> {
const textForInference = `${detail.name} ${detail.partNumber} ${Object.values(detail.specs).join(" ")}`; const textForInference = `${detail.name} ${detail.partNumber} ${Object.values(detail.specs).join(" ")}`;
const fiberType = parsed.fiberType ?? detectFiberType(textForInference); const fiberType = parsed.fiberType ?? detectFiberType(textForInference);
const transceiverId = await findOrCreateScrapedTransceiver({ const transceiverId = detail.targetTransceiverId ?? (await findOrCreateScrapedTransceiver({
partNumber: detail.partNumber, partNumber: detail.partNumber,
vendorId, vendorId,
productUrl: detail.url, productUrl: detail.url,
formFactor: ff, formFactor: ff,
speedGbps: speedInfo?.speedGbps, speedGbps: speedInfo?.speedGbps,
speed: speedInfo?.speed, speed: speedInfo?.speed,
reachLabel: reach ?? parsed.reachLabel, reachLabel: reach ?? parsed.reachLabel,
reachMeters: parsed.reachMeters, reachMeters: parsed.reachMeters,
fiberType, fiberType,
wavelengths: parsed.wavelengths, wavelengths: parsed.wavelengths,
imageUrl: detail.imageUrl, imageUrl: detail.imageUrl,
category: "DataCenter", category: "DataCenter",
}); }));
if (detail.targetTransceiverId) {
await pool.query(
`UPDATE transceivers
SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2),
form_factor = COALESCE(NULLIF(form_factor, ''), $3),
speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($4, speed_gbps) ELSE speed_gbps END,
speed = COALESCE(NULLIF(speed, ''), $5),
reach_label = COALESCE(NULLIF(reach_label, ''), $6),
reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($7, reach_meters) ELSE reach_meters END,
fiber_type = COALESCE(NULLIF(fiber_type, ''), $8),
wavelengths = COALESCE(NULLIF(wavelengths, ''), $9),
updated_at = NOW()
WHERE id = $1`,
[
transceiverId,
detail.url,
ff,
speedInfo?.speedGbps ?? null,
speedInfo?.speed ?? null,
reach ?? parsed.reachLabel ?? null,
parsed.reachMeters ?? null,
fiberType ?? null,
parsed.wavelengths ?? null,
]
);
if (detail.imageUrl) {
await markImageVerified(transceiverId, detail.imageUrl);
}
}
const stockLevel = deriveStockLevel(detail.deQty, detail.globalQty, detail.backorderQty); const stockLevel = deriveStockLevel(detail.deQty, detail.globalQty, detail.backorderQty);
const totalQty = (detail.deQty ?? 0) + (detail.globalQty ?? 0); const totalQty = (detail.deQty ?? 0) + (detail.globalQty ?? 0);
@ -970,14 +1036,18 @@ export async function scrapeFs(): Promise<void> {
}); });
if (stockNew) stockWritten++; if (stockNew) stockWritten++;
if (Object.keys(detail.specs).length > 0) { const hasSourceDetails =
Object.keys(detail.specs).length > 0 ||
Boolean(fiberType || parsed.connector || parsed.wavelengths || parsed.reachLabel || reach);
if (hasSourceDetails) {
const updated = await updateVerifiedSpecs({ const updated = await updateVerifiedSpecs({
transceiverId, transceiverId,
fiberType, fiberType,
connector: parsed.connector, connector: parsed.connector,
wavelengths: parsed.wavelengths, wavelengths: parsed.wavelengths,
reachMeters: parsed.reachMeters, reachMeters: parsed.reachMeters,
reachLabel: parsed.reachLabel, reachLabel: reach ?? parsed.reachLabel,
powerConsumptionW: parsed.powerConsumptionW, powerConsumptionW: parsed.powerConsumptionW,
tempRange: parsed.tempRange, tempRange: parsed.tempRange,
modulation: parsed.modulation, modulation: parsed.modulation,