feat: fs.com scraper Phase 2 — crawl product detail pages for verified specs
- New spec-updater utility: parseSpecTable() + updateVerifiedSpecs()
- fs.com scraper now has 2 phases:
  - Phase 1: Category pages → prices + stock (existing)
  - Phase 2: Product detail pages → fiber_type, connector, wavelength, power, image, datasheet
- Updates data_confidence from 'enriched_estimated' to 'scraped_unverified'
- Processes up to 200 product pages per scraper run
This commit is contained in:
parent
98a7e12282
commit
1daf54e68e
@ -9,6 +9,7 @@
|
||||
import { PlaywrightCrawler } from "crawlee";
|
||||
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
|
||||
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
|
||||
import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
|
||||
|
||||
const BASE_URL = "https://www.fs.com";
|
||||
|
||||
@ -270,6 +271,141 @@ export async function scrapeFs(): Promise<void> {
|
||||
}
|
||||
|
||||
console.log(`\nDatabase: ${written} new, ${skipped} unchanged (${uniqueProducts.size} unique)`);
|
||||
|
||||
// ═══ PHASE 2: Crawl product detail pages for VERIFIED specs ═══
|
||||
console.log("\n=== Phase 2: Scraping product detail pages for verified specs ===");
|
||||
|
||||
// Get products that need spec verification (enriched_estimated or missing key fields)
|
||||
const needsSpecs = await pool.query(
|
||||
`SELECT t.id, t.part_number, t.slug FROM transceivers t
|
||||
JOIN vendors v ON t.vendor_id = v.id
|
||||
WHERE v.slug = 'fs-com'
|
||||
AND (t.data_confidence = 'enriched_estimated' OR t.data_confidence = 'unknown'
|
||||
OR t.connector IS NULL OR t.connector = '' OR t.connector = '-'
|
||||
OR t.wavelengths IS NULL OR t.wavelengths = ''
|
||||
OR t.fiber_type IS NULL OR t.fiber_type = '')
|
||||
LIMIT 200`
|
||||
);
|
||||
console.log(`Products needing spec verification: ${needsSpecs.rows.length}`);
|
||||
|
||||
// Build a map of product URLs from our scraped data
|
||||
const productUrls = new Map<string, string>(); // transceiver_id → product URL
|
||||
for (const p of uniqueProducts.values()) {
|
||||
// Find the transceiver in DB by part number
|
||||
const match = await pool.query(
|
||||
`SELECT id FROM transceivers WHERE part_number = $1 AND vendor_id = $2`,
|
||||
[p.partNumber, vendorId]
|
||||
).catch(() => ({ rows: [] }));
|
||||
if (match.rows[0] && p.url) {
|
||||
productUrls.set(match.rows[0].id, p.url);
|
||||
}
|
||||
}
|
||||
|
||||
let specsUpdated = 0;
|
||||
const specCrawler = new PlaywrightCrawler({
|
||||
maxConcurrency: 1,
|
||||
maxRequestsPerMinute: 10,
|
||||
requestHandlerTimeoutSecs: 45,
|
||||
headless: true,
|
||||
launchContext: {
|
||||
launchOptions: {
|
||||
args: ["--disable-blink-features=AutomationControlled", "--lang=en-US"],
|
||||
},
|
||||
},
|
||||
preNavigationHooks: [
|
||||
async ({ page }) => {
|
||||
await page.context().addCookies([
|
||||
{ name: "currency", value: "USD", domain: ".fs.com", path: "/" },
|
||||
{ name: "lang", value: "en", domain: ".fs.com", path: "/" },
|
||||
]);
|
||||
},
|
||||
],
|
||||
async requestHandler({ page, request, log }) {
|
||||
const transceiverId = request.userData?.transceiverId;
|
||||
if (!transceiverId) return;
|
||||
|
||||
log.info(`Spec scrape: ${request.url}`);
|
||||
await page.waitForTimeout(3000);
|
||||
|
||||
// Extract spec table from product detail page
|
||||
const specData = await page.evaluate(() => {
|
||||
const specs: Record<string, string> = {};
|
||||
// fs.com uses various spec table formats
|
||||
const rows = document.querySelectorAll(
|
||||
".product-param tr, .product-specs tr, table.param-table tr, " +
|
||||
".specifications tr, .detail-param tr, .prod-spec-list tr, " +
|
||||
'[class*="specification"] tr, [class*="param"] tr'
|
||||
);
|
||||
for (const row of rows) {
|
||||
const cells = row.querySelectorAll("td, th");
|
||||
if (cells.length >= 2) {
|
||||
const key = (cells[0]?.textContent || "").trim();
|
||||
const val = (cells[1]?.textContent || "").trim();
|
||||
if (key && val && key.length < 100) specs[key] = val;
|
||||
}
|
||||
}
|
||||
|
||||
// Also try dl/dt/dd pattern
|
||||
const dts = document.querySelectorAll("dt, .spec-label, .param-label");
|
||||
for (const dt of dts) {
|
||||
const dd = dt.nextElementSibling;
|
||||
if (dd && (dd.tagName === "DD" || dd.classList.contains("spec-value") || dd.classList.contains("param-value"))) {
|
||||
const key = (dt.textContent || "").trim();
|
||||
const val = (dd.textContent || "").trim();
|
||||
if (key && val) specs[key] = val;
|
||||
}
|
||||
}
|
||||
|
||||
// Extract image
|
||||
const img = document.querySelector('.product-image img, .prod-img img, [class*="gallery"] img, .product-detail img');
|
||||
const imageUrl = img?.getAttribute("src") || "";
|
||||
|
||||
// Extract datasheet link
|
||||
const dsLink = document.querySelector('a[href*="datasheet"], a[href*=".pdf"]');
|
||||
const datasheetUrl = dsLink?.getAttribute("href") || "";
|
||||
|
||||
return { specs, imageUrl, datasheetUrl };
|
||||
});
|
||||
|
||||
if (Object.keys(specData.specs).length > 0) {
|
||||
const parsed = parseSpecTable(specData.specs);
|
||||
const updated = await updateVerifiedSpecs({
|
||||
transceiverId,
|
||||
fiberType: parsed.fiberType,
|
||||
connector: parsed.connector,
|
||||
wavelengths: parsed.wavelengths,
|
||||
reachMeters: parsed.reachMeters,
|
||||
reachLabel: parsed.reachLabel,
|
||||
powerConsumptionW: parsed.powerConsumptionW,
|
||||
tempRange: parsed.tempRange,
|
||||
modulation: parsed.modulation,
|
||||
domSupport: parsed.domSupport,
|
||||
imageUrl: specData.imageUrl ? (specData.imageUrl.startsWith("http") ? specData.imageUrl : `${BASE_URL}${specData.imageUrl}`) : undefined,
|
||||
datasheetUrl: specData.datasheetUrl ? (specData.datasheetUrl.startsWith("http") ? specData.datasheetUrl : `${BASE_URL}${specData.datasheetUrl}`) : undefined,
|
||||
source: "fs.com",
|
||||
});
|
||||
if (updated) specsUpdated++;
|
||||
}
|
||||
},
|
||||
});
|
||||
|
||||
// Build spec crawl requests (limit to 200 per run to avoid rate limiting)
|
||||
const specRequests = needsSpecs.rows
|
||||
.filter(r => productUrls.has(r.id))
|
||||
.slice(0, 200)
|
||||
.map(r => ({
|
||||
url: productUrls.get(r.id)!,
|
||||
userData: { transceiverId: r.id },
|
||||
}));
|
||||
|
||||
if (specRequests.length > 0) {
|
||||
console.log(`Crawling ${specRequests.length} product detail pages for specs...`);
|
||||
await specCrawler.run(specRequests);
|
||||
console.log(`Specs verified: ${specsUpdated} products updated`);
|
||||
} else {
|
||||
console.log("No product URLs available for spec verification this run");
|
||||
}
|
||||
|
||||
console.log("=== FS.com Scraper Complete ===\n");
|
||||
}
|
||||
|
||||
|
||||
186
packages/scraper/src/utils/spec-updater.ts
Normal file
186
packages/scraper/src/utils/spec-updater.ts
Normal file
@ -0,0 +1,186 @@
|
||||
/**
 * Update transceiver specs with data scraped from vendor product pages.
 * Only fields for which the scrape produced a value are written; columns without a new value are left untouched.
 * Note that a supplied value replaces whatever is currently stored in that column.
 * Rows are moved from 'enriched_estimated' to 'scraped_unverified' (higher confidence than 'enriched_estimated').
 */
|
||||
import { pool } from "./db";
|
||||
|
||||
/**
 * Spec values scraped from a vendor product page for a single transceiver.
 *
 * Everything except `transceiverId` and `source` is optional: a scraper fills in
 * only the fields it managed to extract from the page.
 */
export interface VerifiedSpecs {
  /** Identifier of the transceiver the specs belong to. */
  transceiverId: string;

  /** Fiber/media class, e.g. SMF, MMF, Copper, AOC. */
  fiberType?: string;

  /** Connector family, e.g. LC, SC, MPO-12, MPO-16, RJ45, DAC, AOC. */
  connector?: string;

  /** Wavelength description, e.g. "850nm", "1310nm", "1310nm (4λ CWDM)". */
  wavelengths?: string;

  /** Maximum reach in meters. */
  reachMeters?: number;

  /** Human-readable reach label. */
  reachLabel?: string;

  /** Power consumption in watts. */
  powerConsumptionW?: number;

  /** Temperature class, e.g. COM, IND. */
  tempRange?: string;

  /** Modulation scheme, e.g. NRZ, PAM4. */
  modulation?: string;

  /** Whether digital optical monitoring is supported. */
  domSupport?: boolean;

  /** URL of the product image. */
  imageUrl?: string;

  /** URL of the product datasheet. */
  datasheetUrl?: string;

  /** Origin of the data, e.g. "fs.com", "flexoptix.net". */
  source: string;
}
|
||||
|
||||
/**
 * Write scraped specs for one transceiver to the `transceivers` table.
 *
 * Only fields that carry a usable value are included in the UPDATE: non-empty
 * strings, numbers greater than zero, and `domSupport` whenever it is not
 * `undefined` (so an explicit `false` is written). A supplied value replaces
 * whatever the column currently holds.
 *
 * `data_confidence` is raised to 'scraped_unverified' only when the row is
 * currently NULL, 'enriched_estimated' or 'unknown'; any other value is kept so
 * that a row with a different confidence level is not relabelled by a scrape.
 * `updated_at` is always refreshed.
 *
 * @param specs - Scraped values plus the id of the transceiver row to update.
 *   `specs.source` is not persisted by this function.
 * @returns `true` when a row was actually updated; `false` when there was
 *   nothing to write or no row matched `specs.transceiverId`.
 * @throws Whatever `pool.query` rejects with (the error is not caught here).
 */
export async function updateVerifiedSpecs(specs: VerifiedSpecs): Promise<boolean> {
  // Numeric fields are only meaningful when strictly positive (also filters NaN).
  const positive = (n: number | undefined): number | undefined =>
    n !== undefined && n > 0 ? n : undefined;

  // Column → candidate value, in a fixed order. `undefined` means "do not touch".
  // `|| undefined` drops empty strings the same way a truthiness check would.
  const candidates: ReadonlyArray<readonly [string, string | number | boolean | undefined]> = [
    ["fiber_type", specs.fiberType || undefined],
    ["connector", specs.connector || undefined],
    ["wavelengths", specs.wavelengths || undefined],
    ["reach_meters", positive(specs.reachMeters)],
    ["reach_label", specs.reachLabel || undefined],
    ["power_consumption_w", positive(specs.powerConsumptionW)],
    ["temp_range", specs.tempRange || undefined],
    ["modulation", specs.modulation || undefined],
    ["dom_support", specs.domSupport],
    ["image_url", specs.imageUrl || undefined],
    // NOTE(review): the column name suggests an object-storage key, but a URL is
    // stored here — confirm against the schema.
    ["datasheet_r2_key", specs.datasheetUrl || undefined],
  ];

  const assignments: string[] = [];
  const values: unknown[] = [];

  for (const [column, value] of candidates) {
    if (value === undefined) continue;
    values.push(value);
    // Placeholders are 1-based, so the index equals the length after the push.
    assignments.push(`${column} = $${values.length}`);
    if (column === "image_url") assignments.push("has_image = true");
  }

  if (assignments.length === 0) return false;

  // Upgrade confidence from estimated/unknown to scraped, but never relabel a
  // row that already carries some other confidence value.
  assignments.push(
    "data_confidence = CASE " +
      "WHEN data_confidence IS NULL OR data_confidence IN ('enriched_estimated', 'unknown') " +
      "THEN 'scraped_unverified' ELSE data_confidence END"
  );
  assignments.push("updated_at = NOW()");

  values.push(specs.transceiverId);
  const result = await pool.query(
    `UPDATE transceivers SET ${assignments.join(", ")} WHERE id = $${values.length}`,
    values
  );

  // Report success only if the id actually matched a row.
  return (result.rowCount ?? 0) > 0;
}
|
||||
|
||||
/** A pattern and the canonical value it maps to. Rules are tried in order. */
type SpecRule = readonly [RegExp, string];

const FIBER_TYPE_RULES: readonly SpecRule[] = [
  [/single.?mode|smf|os2/i, "SMF"],
  [/multi.?mode|mmf|om[1-5]/i, "MMF"],
  [/copper|cat[56]/i, "Copper"],
  [/aoc|active.optical/i, "AOC"],
];

const CONNECTOR_RULES: readonly SpecRule[] = [
  [/duplex\s*lc|lc\s*duplex|lc\/pc|lc\/upc|lc\/apc|\blc\b/i, "LC"],
  [/sc\/pc|sc\/apc|\bsc\b/i, "SC"],
  // MTP is matched alongside MPO for every fiber count, so "MTP-16" is not
  // swallowed by the generic 12-fiber fallback below.
  [/(?:mpo|mtp)-?24/i, "MPO-24"],
  [/(?:mpo|mtp)-?16/i, "MPO-16"],
  [/(?:mpo|mtp)-?12|mpo\b|mtp\b/i, "MPO-12"],
  [/rj-?45|copper/i, "RJ45"],
  // Leading \b is required: without it "optics" ends in "cs" and would match.
  [/\bcs\b/i, "CS"],
  [/\bsn\b/i, "SN"],
];

const MODULATION_RULES: readonly SpecRule[] = [
  [/pam4|pam-4/i, "PAM4"],
  [/nrz/i, "NRZ"],
];

/** Return the value of the first rule whose pattern matches `text`, if any. */
function firstMatchingValue(rules: readonly SpecRule[], text: string): string | undefined {
  return rules.find(([pattern]) => pattern.test(text))?.[1];
}

/** CWDM/DWDM values are kept verbatim; otherwise the first "<number>nm" is used. */
function parseWavelengths(val: string): string | undefined {
  if (/cwdm|dwdm/i.test(val)) return val;
  const nm = val.match(/([\d.]+)\s*nm/i);
  return nm ? `${nm[1]}nm` : undefined;
}

/** Extract reach from a value such as "10km", "300m" or "0.5m"; km takes precedence. */
function parseReach(val: string): { reachMeters: number; reachLabel: string } | undefined {
  const km = parseFloat(val.match(/([\d.]+)\s*km/i)?.[1] ?? "");
  if (Number.isFinite(km)) {
    const meters = Math.round(km * 1000);
    return { reachMeters: meters, reachLabel: km >= 1 ? `${km}km` : `${meters}m` };
  }

  const m = parseFloat(val.match(/([\d.]+)\s*m\b/i)?.[1] ?? "");
  if (Number.isFinite(m)) {
    // reachMeters stays a whole number (truncated); the label keeps the exact
    // figure so a 0.5m cable is not labelled "0m".
    return { reachMeters: Math.trunc(m), reachLabel: `${m}m` };
  }
  return undefined;
}

/** Map an operating-temperature value to COM (0…70) or IND (-40…85). */
function parseTempRange(val: string): string | undefined {
  // The lookbehind keeps the "0" of "-40" or "10" from being read as a 0 lower bound.
  if (/(?<!\d)0\D*70|commercial/i.test(val)) return "COM";
  if (/[-−–]\s*40\D*85|industrial/i.test(val)) return "IND";
  return undefined;
}

/** Interpret a DOM/DDM cell; explicit negatives win over the word "supported". */
function parseDomSupport(val: string): boolean {
  if (/\b(?:no|not|none|without|unsupported|n\/a)\b/i.test(val)) return false;
  return /yes|supported|ddm|dom/i.test(val);
}

/**
 * Parse a scraped key/value spec table into structured spec fields.
 *
 * Each entry's key is lower-cased and matched against known labels; the value is
 * then normalised. A field is set only when its value could be interpreted, and
 * when several rows map to the same field the last interpretable row wins.
 *
 * @param specs - Raw label → value pairs as read from the page.
 * @returns The recognised fields; unrecognised rows are ignored.
 */
export function parseSpecTable(specs: Record<string, string>): Partial<VerifiedSpecs> {
  const result: Partial<VerifiedSpecs> = {};

  for (const [rawKey, rawVal] of Object.entries(specs)) {
    const key = rawKey.toLowerCase().trim();
    const val = rawVal.trim();

    // Fiber type
    if ((key.includes("fiber") && key.includes("type")) || key === "cable type" || key === "media") {
      const fiberType = firstMatchingValue(FIBER_TYPE_RULES, val);
      if (fiberType !== undefined) result.fiberType = fiberType;
    }

    // Connector
    if (key.includes("connector") || key.includes("interface")) {
      const connector = firstMatchingValue(CONNECTOR_RULES, val);
      if (connector !== undefined) result.connector = connector;
    }

    // Wavelength ("tx wavelength" is already covered by the "wavelength" check)
    if (key.includes("wavelength") || key.includes("laser")) {
      const wavelengths = parseWavelengths(val);
      if (wavelengths !== undefined) result.wavelengths = wavelengths;
    }

    // Reach / distance
    if (
      key.includes("distance") ||
      key.includes("reach") ||
      key.includes("transmission") ||
      key === "max link length"
    ) {
      const reach = parseReach(val);
      if (reach !== undefined) {
        result.reachMeters = reach.reachMeters;
        result.reachLabel = reach.reachLabel;
      }
    }

    // Power consumption
    if (
      key.includes("power") &&
      (key.includes("consumption") || key.includes("dissipation") || key.includes("max"))
    ) {
      const watts = parseFloat(val.match(/([\d.]+)\s*w/i)?.[1] ?? "");
      // Guard against a lone "." being captured and stored as NaN.
      if (Number.isFinite(watts)) result.powerConsumptionW = watts;
    }

    // Temperature — storage temperature rows describe shelf limits, not the
    // operating class, so they must not overwrite the operating range.
    const isTempKey = key.includes("temperature") || (key.includes("temp") && key.includes("range"));
    if (isTempKey && !key.includes("storage")) {
      const tempRange = parseTempRange(val);
      if (tempRange !== undefined) result.tempRange = tempRange;
    }

    // DOM / DDM — whole-word match so keys that merely contain "dom" are ignored.
    if (/\b(?:dom|ddm)\b|diagnostic/.test(key)) {
      result.domSupport = parseDomSupport(val);
    }

    // Modulation
    if (key.includes("modulation") || key.includes("encoding")) {
      const modulation = firstMatchingValue(MODULATION_RULES, val);
      if (modulation !== undefined) result.modulation = modulation;
    }
  }

  return result;
}
|
||||
Loading…
x
Reference in New Issue
Block a user