feat: fs.com scraper Phase 2 — crawl product detail pages for verified specs

- New spec-updater utility: parseSpecTable() + updateVerifiedSpecs()
- fs.com scraper now has 2 phases:
  Phase 1: Category pages → prices + stock (existing)
  Phase 2: Product detail pages → fiber_type, connector, wavelength, power, image, datasheet
- Updates data_confidence from 'enriched_estimated' to 'scraped_unverified'
- Processes up to 200 product pages per scraper run
This commit is contained in:
Rene Fichtmueller 2026-03-31 09:18:27 +02:00
parent 98a7e12282
commit 1daf54e68e
2 changed files with 322 additions and 0 deletions

View File

@ -9,6 +9,7 @@
import { PlaywrightCrawler } from "crawlee"; import { PlaywrightCrawler } from "crawlee";
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db"; import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash"; import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
const BASE_URL = "https://www.fs.com"; const BASE_URL = "https://www.fs.com";
@ -270,6 +271,141 @@ export async function scrapeFs(): Promise<void> {
} }
console.log(`\nDatabase: ${written} new, ${skipped} unchanged (${uniqueProducts.size} unique)`); console.log(`\nDatabase: ${written} new, ${skipped} unchanged (${uniqueProducts.size} unique)`);
// ═══ PHASE 2: Crawl product detail pages for VERIFIED specs ═══
console.log("\n=== Phase 2: Scraping product detail pages for verified specs ===");
// Get products that need spec verification (enriched_estimated or missing key fields)
const needsSpecs = await pool.query(
`SELECT t.id, t.part_number, t.slug FROM transceivers t
JOIN vendors v ON t.vendor_id = v.id
WHERE v.slug = 'fs-com'
AND (t.data_confidence = 'enriched_estimated' OR t.data_confidence = 'unknown'
OR t.connector IS NULL OR t.connector = '' OR t.connector = '-'
OR t.wavelengths IS NULL OR t.wavelengths = ''
OR t.fiber_type IS NULL OR t.fiber_type = '')
LIMIT 200`
);
console.log(`Products needing spec verification: ${needsSpecs.rows.length}`);
// Build a map of product URLs from our scraped data
const productUrls = new Map<string, string>(); // transceiver_id → product URL
for (const p of uniqueProducts.values()) {
// Find the transceiver in DB by part number
const match = await pool.query(
`SELECT id FROM transceivers WHERE part_number = $1 AND vendor_id = $2`,
[p.partNumber, vendorId]
).catch(() => ({ rows: [] }));
if (match.rows[0] && p.url) {
productUrls.set(match.rows[0].id, p.url);
}
}
let specsUpdated = 0;
const specCrawler = new PlaywrightCrawler({
maxConcurrency: 1,
maxRequestsPerMinute: 10,
requestHandlerTimeoutSecs: 45,
headless: true,
launchContext: {
launchOptions: {
args: ["--disable-blink-features=AutomationControlled", "--lang=en-US"],
},
},
preNavigationHooks: [
async ({ page }) => {
await page.context().addCookies([
{ name: "currency", value: "USD", domain: ".fs.com", path: "/" },
{ name: "lang", value: "en", domain: ".fs.com", path: "/" },
]);
},
],
async requestHandler({ page, request, log }) {
const transceiverId = request.userData?.transceiverId;
if (!transceiverId) return;
log.info(`Spec scrape: ${request.url}`);
await page.waitForTimeout(3000);
// Extract spec table from product detail page
const specData = await page.evaluate(() => {
const specs: Record<string, string> = {};
// fs.com uses various spec table formats
const rows = document.querySelectorAll(
".product-param tr, .product-specs tr, table.param-table tr, " +
".specifications tr, .detail-param tr, .prod-spec-list tr, " +
'[class*="specification"] tr, [class*="param"] tr'
);
for (const row of rows) {
const cells = row.querySelectorAll("td, th");
if (cells.length >= 2) {
const key = (cells[0]?.textContent || "").trim();
const val = (cells[1]?.textContent || "").trim();
if (key && val && key.length < 100) specs[key] = val;
}
}
// Also try dl/dt/dd pattern
const dts = document.querySelectorAll("dt, .spec-label, .param-label");
for (const dt of dts) {
const dd = dt.nextElementSibling;
if (dd && (dd.tagName === "DD" || dd.classList.contains("spec-value") || dd.classList.contains("param-value"))) {
const key = (dt.textContent || "").trim();
const val = (dd.textContent || "").trim();
if (key && val) specs[key] = val;
}
}
// Extract image
const img = document.querySelector('.product-image img, .prod-img img, [class*="gallery"] img, .product-detail img');
const imageUrl = img?.getAttribute("src") || "";
// Extract datasheet link
const dsLink = document.querySelector('a[href*="datasheet"], a[href*=".pdf"]');
const datasheetUrl = dsLink?.getAttribute("href") || "";
return { specs, imageUrl, datasheetUrl };
});
if (Object.keys(specData.specs).length > 0) {
const parsed = parseSpecTable(specData.specs);
const updated = await updateVerifiedSpecs({
transceiverId,
fiberType: parsed.fiberType,
connector: parsed.connector,
wavelengths: parsed.wavelengths,
reachMeters: parsed.reachMeters,
reachLabel: parsed.reachLabel,
powerConsumptionW: parsed.powerConsumptionW,
tempRange: parsed.tempRange,
modulation: parsed.modulation,
domSupport: parsed.domSupport,
imageUrl: specData.imageUrl ? (specData.imageUrl.startsWith("http") ? specData.imageUrl : `${BASE_URL}${specData.imageUrl}`) : undefined,
datasheetUrl: specData.datasheetUrl ? (specData.datasheetUrl.startsWith("http") ? specData.datasheetUrl : `${BASE_URL}${specData.datasheetUrl}`) : undefined,
source: "fs.com",
});
if (updated) specsUpdated++;
}
},
});
// Build spec crawl requests (limit to 200 per run to avoid rate limiting)
const specRequests = needsSpecs.rows
.filter(r => productUrls.has(r.id))
.slice(0, 200)
.map(r => ({
url: productUrls.get(r.id)!,
userData: { transceiverId: r.id },
}));
if (specRequests.length > 0) {
console.log(`Crawling ${specRequests.length} product detail pages for specs...`);
await specCrawler.run(specRequests);
console.log(`Specs verified: ${specsUpdated} products updated`);
} else {
console.log("No product URLs available for spec verification this run");
}
console.log("=== FS.com Scraper Complete ===\n"); console.log("=== FS.com Scraper Complete ===\n");
} }

View File

@ -0,0 +1,186 @@
/**
 * Update transceiver specs with data scraped from vendor product pages.
 * Every field the scraper supplies overwrites the stored value; fields it could not
 * extract are left untouched.
 * Marks updated products as 'scraped_unverified' (higher confidence than 'enriched_estimated').
 */
import { pool } from "./db";
/**
 * Spec values extracted from one vendor product page for a single transceiver.
 * Only `transceiverId` and `source` are required; any other field may be
 * omitted when the page yielded no value for it.
 */
export interface VerifiedSpecs {
  /** Matched against `transceivers.id` when the update is written. */
  transceiverId: string;
  /** Fiber/media class: SMF, MMF, Copper, AOC. */
  fiberType?: string;
  /** Connector family: LC, SC, MPO-12, MPO-16, RJ45, DAC, AOC. */
  connector?: string;
  /** Wavelength text, e.g. "850nm", "1310nm", "1310nm (4λ CWDM)". */
  wavelengths?: string;
  /** Reach as a number of meters. */
  reachMeters?: number;
  /** Reach as display text. */
  reachLabel?: string;
  /** Power consumption in watts. */
  powerConsumptionW?: number;
  /** Temperature class: COM, IND. */
  tempRange?: string;
  /** Line modulation: NRZ, PAM4. */
  modulation?: string;
  /** Whether the page reports DOM/DDM support. */
  domSupport?: boolean;
  /** Product image URL. */
  imageUrl?: string;
  /** Datasheet URL. */
  datasheetUrl?: string;
  /** Origin of the data: "fs.com", "flexoptix.net", etc. */
  source: string;
}
/**
 * Write the spec values scraped from a vendor product page onto one transceiver row.
 *
 * Each field present in `specs` overwrites the stored column. Absent fields
 * (undefined / empty string / non-positive or non-finite numbers) are left untouched.
 * Whenever at least one column is written, `data_confidence` is set to
 * 'scraped_unverified' and `updated_at` is refreshed.
 *
 * NOTE(review): the overwrite and the confidence change are unconditional, so a row
 * that already carries a different confidence value is relabelled as well — confirm
 * this is intended.
 *
 * @param specs Values to write; `specs.transceiverId` selects the row.
 * @returns `true` when an UPDATE was issued and matched a row; `false` when there was
 *          nothing to write, no id was given, or no row has that id.
 * @throws Whatever `pool.query` rejects with (database errors are not swallowed).
 */
export async function updateVerifiedSpecs(specs: VerifiedSpecs): Promise<boolean> {
  // Without an id the WHERE clause cannot match anything meaningful.
  if (!specs.transceiverId) return false;

  const assignments: string[] = [];
  const values: unknown[] = [];

  // Registers `column = $n` with its bind value; the placeholder number is simply the
  // position of the value in `values`, so the two can never drift apart.
  const assign = (column: string, value: unknown, extraSql = ""): void => {
    values.push(value);
    assignments.push(`${column} = $${values.length}${extraSql}`);
  };

  // Rejects undefined, NaN, Infinity, zero and negatives in one place.
  const isPositiveFinite = (n: number | undefined): n is number =>
    n !== undefined && Number.isFinite(n) && n > 0;

  if (specs.fiberType) assign("fiber_type", specs.fiberType);
  if (specs.connector) assign("connector", specs.connector);
  if (specs.wavelengths) assign("wavelengths", specs.wavelengths);
  if (isPositiveFinite(specs.reachMeters)) assign("reach_meters", specs.reachMeters);
  if (specs.reachLabel) assign("reach_label", specs.reachLabel);
  if (isPositiveFinite(specs.powerConsumptionW)) assign("power_consumption_w", specs.powerConsumptionW);
  if (specs.tempRange) assign("temp_range", specs.tempRange);
  if (specs.modulation) assign("modulation", specs.modulation);
  // `false` is a real value here, so only skip when the field is absent.
  if (specs.domSupport !== undefined) assign("dom_support", specs.domSupport);
  if (specs.imageUrl) assign("image_url", specs.imageUrl, ", has_image = true");
  // NOTE(review): a URL is stored in a column named like an object-storage key —
  // confirm against the schema.
  if (specs.datasheetUrl) assign("datasheet_r2_key", specs.datasheetUrl);

  if (assignments.length === 0) return false;

  assignments.push("data_confidence = 'scraped_unverified'", "updated_at = NOW()");
  values.push(specs.transceiverId);

  const result = await pool.query(
    `UPDATE transceivers SET ${assignments.join(", ")} WHERE id = $${values.length}`,
    values
  );

  // Report success only when a row was actually touched; if the driver does not
  // expose a row count, keep the previous "query ran => true" behaviour.
  return result.rowCount == null ? true : result.rowCount > 0;
}
/** Map a fiber/media description to SMF, MMF, Copper or AOC; undefined when unrecognised. */
function classifyFiberType(val: string): string | undefined {
  if (/single.?mode|smf|os2/i.test(val)) return "SMF";
  if (/multi.?mode|mmf|om[1-5]/i.test(val)) return "MMF";
  if (/copper|cat[56]/i.test(val)) return "Copper";
  if (/aoc|active.optical/i.test(val)) return "AOC";
  return undefined;
}

/** Map a connector description to its family name; undefined when unrecognised. */
function classifyConnector(val: string): string | undefined {
  if (/duplex\s*lc|lc\s*duplex|lc\/[au]?pc|\blc\b/i.test(val)) return "LC";
  if (/sc\/pc|sc\/apc|\bsc\b/i.test(val)) return "SC";
  if (/mpo-?24/i.test(val)) return "MPO-24";
  if (/mpo-?16/i.test(val)) return "MPO-16";
  if (/mpo-?12|mtp-?12|mpo\b|mtp\b/i.test(val)) return "MPO-12";
  if (/rj-?45|copper/i.test(val)) return "RJ45";
  // Whole-word match only: without the leading \b, "optics" would count as CS.
  if (/\bcs\b/i.test(val)) return "CS";
  if (/\bsn\b/i.test(val)) return "SN";
  return undefined;
}

/**
 * Parse a spec table from a product page into structured data.
 * Works for fs.com, 10gtek, and similar HTML spec tables.
 *
 * Rows are matched by substring on the lower-cased label, and a later row that
 * matches the same category overwrites the value taken from an earlier one.
 * Values that cannot be recognised leave the corresponding field unset.
 *
 * @param specs Label → value pairs exactly as read from the page.
 * @returns The fields that could be recognised; never includes `transceiverId` or `source`.
 */
export function parseSpecTable(specs: Record<string, string>): Partial<VerifiedSpecs> {
  const result: Partial<VerifiedSpecs> = {};

  for (const [rawKey, rawVal] of Object.entries(specs)) {
    const key = rawKey.toLowerCase().trim();
    const val = rawVal.trim();

    // Fiber Type
    if ((key.includes("fiber") && key.includes("type")) || key === "cable type" || key === "media") {
      const fiberType = classifyFiberType(val);
      if (fiberType) result.fiberType = fiberType;
    }

    // Connector
    if (key.includes("connector") || key.includes("interface")) {
      const connector = classifyConnector(val);
      if (connector) result.connector = connector;
    }

    // Wavelength
    if (key.includes("wavelength") || key.includes("laser")) {
      const nm = val.match(/(\d+(?:\.\d+)?)\s*nm/i);
      if (nm?.[1]) result.wavelengths = `${nm[1]}nm`;
      // Multi-wavelength parts keep the full description instead of one number.
      if (/cwdm|dwdm/i.test(val)) result.wavelengths = val;
    }

    // Reach / Distance
    if (key.includes("distance") || key.includes("reach") || key.includes("transmission") || key === "max link length") {
      // Require at least one digit so a stray "." cannot turn into NaN.
      const kmMatch = val.match(/(\d+(?:\.\d+)?)\s*km/i);
      const mMatch = val.match(/(\d+(?:\.\d+)?)\s*m\b/i);
      if (kmMatch?.[1]) {
        const km = parseFloat(kmMatch[1]);
        const meters = Math.round(km * 1000);
        result.reachMeters = meters;
        result.reachLabel = km >= 1 ? `${km}km` : `${meters}m`;
      } else if (mMatch?.[1]) {
        // parseFloat keeps sub-meter lengths such as "0.5m" (short DAC cables);
        // the numeric field is rounded, the label keeps the exact figure.
        const meters = parseFloat(mMatch[1]);
        result.reachMeters = Math.round(meters);
        result.reachLabel = `${meters}m`;
      }
    }

    // Power Consumption
    if (key.includes("power") && (key.includes("consumption") || key.includes("dissipation") || key.includes("max"))) {
      const watts = val.match(/(\d+(?:\.\d+)?)\s*w/i);
      if (watts?.[1]) result.powerConsumptionW = parseFloat(watts[1]);
    }

    // Temperature — storage limits say nothing about the operating class, skip them.
    const isTemperatureKey = key.includes("temperature") || (key.includes("temp") && key.includes("range"));
    if (isTemperatureKey && !key.includes("storage")) {
      if (/0.*70|commercial/i.test(val)) result.tempRange = "COM";
      else if (/-40.*85|industrial/i.test(val)) result.tempRange = "IND";
    }

    // DOM — negations must be checked first: "Not Supported" contains "supported".
    if (key.includes("dom") || key.includes("ddm") || key.includes("diagnostic")) {
      const negated = /\b(?:no|not|none|unsupported|without)\b|n\/a/i.test(val);
      result.domSupport = !negated && /yes|supported|ddm|dom/i.test(val);
    }

    // Modulation
    if (key.includes("modulation") || key.includes("encoding")) {
      if (/pam-?4/i.test(val)) result.modulation = "PAM4";
      else if (/nrz/i.test(val)) result.modulation = "NRZ";
    }
  }

  return result;
}