fix: add FS.COM URL discovery mode
This commit is contained in:
parent
7ddc439315
commit
3d79f6b8e0
@ -58,6 +58,7 @@ import {
|
||||
upsertPriceObservation,
|
||||
upsertStockObservation,
|
||||
findOrCreateScrapedTransceiver,
|
||||
markImageVerified,
|
||||
pool,
|
||||
} from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
@ -73,6 +74,7 @@ const STOCK_FRESH_HOURS = parseInt(process.env["FS_STOCK_FRESH_HOURS"] ?? "12",
|
||||
const FORCE_REVALIDATE = process.env["TIP_FORCE_REVALIDATE"] === "1";
|
||||
const ONLY_MISSING_IMAGES = process.env["FS_ONLY_MISSING_IMAGES"] === "1";
|
||||
const DB_DETAIL_ONLY = process.env["FS_DB_DETAIL_ONLY"] === "1";
|
||||
const URL_DISCOVERY_ONLY = process.env["FS_URL_DISCOVERY_ONLY"] === "1";
|
||||
|
||||
const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
|
||||
.split(",")
|
||||
@ -262,6 +264,7 @@ interface ProductSummary {
|
||||
url: string;
|
||||
name: string;
|
||||
partNumber: string;
|
||||
targetTransceiverId?: string;
|
||||
}
|
||||
|
||||
interface ProductDetail extends ProductSummary {
|
||||
@ -394,7 +397,7 @@ async function collectProductUrls(
|
||||
// ── Phase 2: Scrape product detail pages ──────────────────────────────────────
|
||||
|
||||
async function scrapeProductDetails(
|
||||
requests: Array<{ url: string; userData: { name: string; partNumber: string } }>,
|
||||
requests: Array<{ url: string; userData: { name: string; partNumber: string; targetTransceiverId?: string } }>,
|
||||
proxyConfiguration: ProxyConfiguration | undefined
|
||||
): Promise<ProductDetail[]> {
|
||||
// Purge Phase 2 storage so it starts with a clean request queue
|
||||
@ -425,7 +428,9 @@ async function scrapeProductDetails(
|
||||
const { name: listingName, partNumber: listingPn } = request.userData as {
|
||||
name: string;
|
||||
partNumber: string;
|
||||
targetTransceiverId?: string;
|
||||
};
|
||||
const { targetTransceiverId } = request.userData as { targetTransceiverId?: string };
|
||||
const url = request.url;
|
||||
|
||||
try {
|
||||
@ -732,6 +737,7 @@ async function scrapeProductDetails(
|
||||
specs: raw.specs,
|
||||
imageUrl: resolveUrl(raw.imageUrl),
|
||||
datasheetUrl: resolveUrl(raw.datasheetUrl),
|
||||
targetTransceiverId,
|
||||
});
|
||||
},
|
||||
}, makeCrawleeConfig("fs-phase2"));
|
||||
@ -789,11 +795,34 @@ export async function scrapeFs(): Promise<void> {
|
||||
|
||||
// ── Phase 1: Discover product URLs ─────────────────────────────────────────
|
||||
let productMap: Map<string, ProductSummary>;
|
||||
if (DB_DETAIL_ONLY) {
|
||||
if (URL_DISCOVERY_ONLY) {
|
||||
console.log("\n[Phase 1] URL discovery mode — probing FS.COM rows without product URLs…");
|
||||
const dbRows = await pool.query(
|
||||
`
|
||||
SELECT t.id, t.part_number
|
||||
FROM transceivers t
|
||||
JOIN vendors v ON v.id = t.vendor_id
|
||||
WHERE v.name = 'FS.COM'
|
||||
AND COALESCE(t.product_page_url, '') = ''
|
||||
AND t.part_number ~ '^FS-[0-9]+$'
|
||||
ORDER BY t.part_number
|
||||
LIMIT $1
|
||||
`,
|
||||
[MAX_DETAIL_PAGES_PER_RUN]
|
||||
);
|
||||
productMap = new Map(
|
||||
dbRows.rows.map((row) => {
|
||||
const partNumber = row.part_number as string;
|
||||
const productId = partNumber.replace(/^FS-/, "");
|
||||
const url = `${BASE_URL}/products/${productId}.html`;
|
||||
return [url, { url, name: partNumber, partNumber, targetTransceiverId: row.id as string }];
|
||||
})
|
||||
);
|
||||
} else if (DB_DETAIL_ONLY) {
|
||||
console.log("\n[Phase 1] DB detail-only mode — using existing FS.COM product URLs with missing verification signals…");
|
||||
const dbRows = await pool.query(
|
||||
`
|
||||
SELECT t.part_number, t.product_page_url
|
||||
SELECT t.id, t.part_number, t.product_page_url
|
||||
FROM transceivers t
|
||||
JOIN vendors v ON v.id = t.vendor_id
|
||||
WHERE v.name = 'FS.COM'
|
||||
@ -820,7 +849,7 @@ export async function scrapeFs(): Promise<void> {
|
||||
dbRows.rows.map((row) => {
|
||||
const url = normalizeFsProductUrl(row.product_page_url as string);
|
||||
const partNumber = row.part_number as string;
|
||||
return [url, { url, name: partNumber, partNumber }];
|
||||
return [url, { url, name: partNumber, partNumber, targetTransceiverId: row.id as string }];
|
||||
})
|
||||
);
|
||||
} else {
|
||||
@ -889,7 +918,14 @@ export async function scrapeFs(): Promise<void> {
|
||||
// ── Phase 2: Scrape detail pages ────────────────────────────────────────────
|
||||
const detailRequests = urlsToScrape.map((url) => {
|
||||
const s = productMap.get(url);
|
||||
return { url, userData: { name: s?.name ?? "FS.com Product", partNumber: s?.partNumber ?? "" } };
|
||||
return {
|
||||
url,
|
||||
userData: {
|
||||
name: s?.name ?? "FS.com Product",
|
||||
partNumber: s?.partNumber ?? "",
|
||||
targetTransceiverId: s?.targetTransceiverId,
|
||||
},
|
||||
};
|
||||
});
|
||||
|
||||
const details = await scrapeProductDetails(detailRequests, proxyConfiguration);
|
||||
@ -911,7 +947,7 @@ export async function scrapeFs(): Promise<void> {
|
||||
const textForInference = `${detail.name} ${detail.partNumber} ${Object.values(detail.specs).join(" ")}`;
|
||||
const fiberType = parsed.fiberType ?? detectFiberType(textForInference);
|
||||
|
||||
const transceiverId = await findOrCreateScrapedTransceiver({
|
||||
const transceiverId = detail.targetTransceiverId ?? (await findOrCreateScrapedTransceiver({
|
||||
partNumber: detail.partNumber,
|
||||
vendorId,
|
||||
productUrl: detail.url,
|
||||
@ -924,7 +960,37 @@ export async function scrapeFs(): Promise<void> {
|
||||
wavelengths: parsed.wavelengths,
|
||||
imageUrl: detail.imageUrl,
|
||||
category: "DataCenter",
|
||||
});
|
||||
}));
|
||||
|
||||
if (detail.targetTransceiverId) {
|
||||
await pool.query(
|
||||
`UPDATE transceivers
|
||||
SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2),
|
||||
form_factor = COALESCE(NULLIF(form_factor, ''), $3),
|
||||
speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($4, speed_gbps) ELSE speed_gbps END,
|
||||
speed = COALESCE(NULLIF(speed, ''), $5),
|
||||
reach_label = COALESCE(NULLIF(reach_label, ''), $6),
|
||||
reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($7, reach_meters) ELSE reach_meters END,
|
||||
fiber_type = COALESCE(NULLIF(fiber_type, ''), $8),
|
||||
wavelengths = COALESCE(NULLIF(wavelengths, ''), $9),
|
||||
updated_at = NOW()
|
||||
WHERE id = $1`,
|
||||
[
|
||||
transceiverId,
|
||||
detail.url,
|
||||
ff,
|
||||
speedInfo?.speedGbps ?? null,
|
||||
speedInfo?.speed ?? null,
|
||||
reach ?? parsed.reachLabel ?? null,
|
||||
parsed.reachMeters ?? null,
|
||||
fiberType ?? null,
|
||||
parsed.wavelengths ?? null,
|
||||
]
|
||||
);
|
||||
if (detail.imageUrl) {
|
||||
await markImageVerified(transceiverId, detail.imageUrl);
|
||||
}
|
||||
}
|
||||
|
||||
const stockLevel = deriveStockLevel(detail.deQty, detail.globalQty, detail.backorderQty);
|
||||
const totalQty = (detail.deQty ?? 0) + (detail.globalQty ?? 0);
|
||||
@ -970,14 +1036,18 @@ export async function scrapeFs(): Promise<void> {
|
||||
});
|
||||
if (stockNew) stockWritten++;
|
||||
|
||||
if (Object.keys(detail.specs).length > 0) {
|
||||
const hasSourceDetails =
|
||||
Object.keys(detail.specs).length > 0 ||
|
||||
Boolean(fiberType || parsed.connector || parsed.wavelengths || parsed.reachLabel || reach);
|
||||
|
||||
if (hasSourceDetails) {
|
||||
const updated = await updateVerifiedSpecs({
|
||||
transceiverId,
|
||||
fiberType,
|
||||
connector: parsed.connector,
|
||||
wavelengths: parsed.wavelengths,
|
||||
reachMeters: parsed.reachMeters,
|
||||
reachLabel: parsed.reachLabel,
|
||||
reachLabel: reach ?? parsed.reachLabel,
|
||||
powerConsumptionW: parsed.powerConsumptionW,
|
||||
tempRange: parsed.tempRange,
|
||||
modulation: parsed.modulation,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user