feat: fs.com scraper Phase 2 — crawl product detail pages for verified specs
- New spec-updater utility: parseSpecTable() + updateVerifiedSpecs()
- fs.com scraper now has 2 phases:
  - Phase 1: Category pages → prices + stock (existing)
  - Phase 2: Product detail pages → fiber_type, connector, wavelength, power, image, datasheet
- Updates data_confidence from 'enriched_estimated' to 'scraped_unverified'
- Processes up to 200 product pages per scraper run
This commit is contained in:
parent
eec42e4818
commit
e4c89de6c0
@ -9,6 +9,7 @@
|
|||||||
import { PlaywrightCrawler } from "crawlee";
|
import { PlaywrightCrawler } from "crawlee";
|
||||||
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
|
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
|
||||||
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
|
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
|
||||||
|
import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
|
||||||
|
|
||||||
const BASE_URL = "https://www.fs.com";
|
const BASE_URL = "https://www.fs.com";
|
||||||
|
|
||||||
@ -270,6 +271,141 @@ export async function scrapeFs(): Promise<void> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
console.log(`\nDatabase: ${written} new, ${skipped} unchanged (${uniqueProducts.size} unique)`);
|
console.log(`\nDatabase: ${written} new, ${skipped} unchanged (${uniqueProducts.size} unique)`);
|
||||||
|
|
||||||
|
// ═══ PHASE 2: Crawl product detail pages for VERIFIED specs ═══
|
||||||
|
console.log("\n=== Phase 2: Scraping product detail pages for verified specs ===");
|
||||||
|
|
||||||
|
// Get products that need spec verification (enriched_estimated or missing key fields)
|
||||||
|
const needsSpecs = await pool.query(
|
||||||
|
`SELECT t.id, t.part_number, t.slug FROM transceivers t
|
||||||
|
JOIN vendors v ON t.vendor_id = v.id
|
||||||
|
WHERE v.slug = 'fs-com'
|
||||||
|
AND (t.data_confidence = 'enriched_estimated' OR t.data_confidence = 'unknown'
|
||||||
|
OR t.connector IS NULL OR t.connector = '' OR t.connector = '-'
|
||||||
|
OR t.wavelengths IS NULL OR t.wavelengths = ''
|
||||||
|
OR t.fiber_type IS NULL OR t.fiber_type = '')
|
||||||
|
LIMIT 200`
|
||||||
|
);
|
||||||
|
console.log(`Products needing spec verification: ${needsSpecs.rows.length}`);
|
||||||
|
|
||||||
|
// Build a map of product URLs from our scraped data
|
||||||
|
const productUrls = new Map<string, string>(); // transceiver_id → product URL
|
||||||
|
for (const p of uniqueProducts.values()) {
|
||||||
|
// Find the transceiver in DB by part number
|
||||||
|
const match = await pool.query(
|
||||||
|
`SELECT id FROM transceivers WHERE part_number = $1 AND vendor_id = $2`,
|
||||||
|
[p.partNumber, vendorId]
|
||||||
|
).catch(() => ({ rows: [] }));
|
||||||
|
if (match.rows[0] && p.url) {
|
||||||
|
productUrls.set(match.rows[0].id, p.url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let specsUpdated = 0;
|
||||||
|
const specCrawler = new PlaywrightCrawler({
|
||||||
|
maxConcurrency: 1,
|
||||||
|
maxRequestsPerMinute: 10,
|
||||||
|
requestHandlerTimeoutSecs: 45,
|
||||||
|
headless: true,
|
||||||
|
launchContext: {
|
||||||
|
launchOptions: {
|
||||||
|
args: ["--disable-blink-features=AutomationControlled", "--lang=en-US"],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
preNavigationHooks: [
|
||||||
|
async ({ page }) => {
|
||||||
|
await page.context().addCookies([
|
||||||
|
{ name: "currency", value: "USD", domain: ".fs.com", path: "/" },
|
||||||
|
{ name: "lang", value: "en", domain: ".fs.com", path: "/" },
|
||||||
|
]);
|
||||||
|
},
|
||||||
|
],
|
||||||
|
async requestHandler({ page, request, log }) {
|
||||||
|
const transceiverId = request.userData?.transceiverId;
|
||||||
|
if (!transceiverId) return;
|
||||||
|
|
||||||
|
log.info(`Spec scrape: ${request.url}`);
|
||||||
|
await page.waitForTimeout(3000);
|
||||||
|
|
||||||
|
// Extract spec table from product detail page
|
||||||
|
const specData = await page.evaluate(() => {
|
||||||
|
const specs: Record<string, string> = {};
|
||||||
|
// fs.com uses various spec table formats
|
||||||
|
const rows = document.querySelectorAll(
|
||||||
|
".product-param tr, .product-specs tr, table.param-table tr, " +
|
||||||
|
".specifications tr, .detail-param tr, .prod-spec-list tr, " +
|
||||||
|
'[class*="specification"] tr, [class*="param"] tr'
|
||||||
|
);
|
||||||
|
for (const row of rows) {
|
||||||
|
const cells = row.querySelectorAll("td, th");
|
||||||
|
if (cells.length >= 2) {
|
||||||
|
const key = (cells[0]?.textContent || "").trim();
|
||||||
|
const val = (cells[1]?.textContent || "").trim();
|
||||||
|
if (key && val && key.length < 100) specs[key] = val;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Also try dl/dt/dd pattern
|
||||||
|
const dts = document.querySelectorAll("dt, .spec-label, .param-label");
|
||||||
|
for (const dt of dts) {
|
||||||
|
const dd = dt.nextElementSibling;
|
||||||
|
if (dd && (dd.tagName === "DD" || dd.classList.contains("spec-value") || dd.classList.contains("param-value"))) {
|
||||||
|
const key = (dt.textContent || "").trim();
|
||||||
|
const val = (dd.textContent || "").trim();
|
||||||
|
if (key && val) specs[key] = val;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract image
|
||||||
|
const img = document.querySelector('.product-image img, .prod-img img, [class*="gallery"] img, .product-detail img');
|
||||||
|
const imageUrl = img?.getAttribute("src") || "";
|
||||||
|
|
||||||
|
// Extract datasheet link
|
||||||
|
const dsLink = document.querySelector('a[href*="datasheet"], a[href*=".pdf"]');
|
||||||
|
const datasheetUrl = dsLink?.getAttribute("href") || "";
|
||||||
|
|
||||||
|
return { specs, imageUrl, datasheetUrl };
|
||||||
|
});
|
||||||
|
|
||||||
|
if (Object.keys(specData.specs).length > 0) {
|
||||||
|
const parsed = parseSpecTable(specData.specs);
|
||||||
|
const updated = await updateVerifiedSpecs({
|
||||||
|
transceiverId,
|
||||||
|
fiberType: parsed.fiberType,
|
||||||
|
connector: parsed.connector,
|
||||||
|
wavelengths: parsed.wavelengths,
|
||||||
|
reachMeters: parsed.reachMeters,
|
||||||
|
reachLabel: parsed.reachLabel,
|
||||||
|
powerConsumptionW: parsed.powerConsumptionW,
|
||||||
|
tempRange: parsed.tempRange,
|
||||||
|
modulation: parsed.modulation,
|
||||||
|
domSupport: parsed.domSupport,
|
||||||
|
imageUrl: specData.imageUrl ? (specData.imageUrl.startsWith("http") ? specData.imageUrl : `${BASE_URL}${specData.imageUrl}`) : undefined,
|
||||||
|
datasheetUrl: specData.datasheetUrl ? (specData.datasheetUrl.startsWith("http") ? specData.datasheetUrl : `${BASE_URL}${specData.datasheetUrl}`) : undefined,
|
||||||
|
source: "fs.com",
|
||||||
|
});
|
||||||
|
if (updated) specsUpdated++;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
// Build spec crawl requests (limit to 200 per run to avoid rate limiting)
|
||||||
|
const specRequests = needsSpecs.rows
|
||||||
|
.filter(r => productUrls.has(r.id))
|
||||||
|
.slice(0, 200)
|
||||||
|
.map(r => ({
|
||||||
|
url: productUrls.get(r.id)!,
|
||||||
|
userData: { transceiverId: r.id },
|
||||||
|
}));
|
||||||
|
|
||||||
|
if (specRequests.length > 0) {
|
||||||
|
console.log(`Crawling ${specRequests.length} product detail pages for specs...`);
|
||||||
|
await specCrawler.run(specRequests);
|
||||||
|
console.log(`Specs verified: ${specsUpdated} products updated`);
|
||||||
|
} else {
|
||||||
|
console.log("No product URLs available for spec verification this run");
|
||||||
|
}
|
||||||
|
|
||||||
console.log("=== FS.com Scraper Complete ===\n");
|
console.log("=== FS.com Scraper Complete ===\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
186
packages/scraper/src/utils/spec-updater.ts
Normal file
186
packages/scraper/src/utils/spec-updater.ts
Normal file
@ -0,0 +1,186 @@
|
|||||||
|
/**
|
||||||
|
* Update transceiver specs with verified data from vendor product pages.
|
||||||
|
 * Overwrites each field for which the scrape produced a value and leaves all other fields untouched (existing values are not checked before overwriting).
|
||||||
|
* Marks updated products as 'scraped_unverified' (higher confidence than 'enriched_estimated').
|
||||||
|
*/
|
||||||
|
import { pool } from "./db";
|
||||||
|
|
||||||
|
/**
 * Spec values scraped from a vendor product page for a single transceiver.
 *
 * Only `transceiverId` and `source` are required; every spec field is optional
 * so a scrape can report just the values it managed to extract.
 */
export interface VerifiedSpecs {
  /** Identifier of the transceiver the specs belong to. */
  transceiverId: string;
  /** Fiber/media type, e.g. SMF, MMF, Copper, AOC. */
  fiberType?: string;
  /** Connector type, e.g. LC, SC, MPO-12, MPO-16, RJ45, DAC, AOC. */
  connector?: string;
  /** Wavelength text, e.g. "850nm", "1310nm", "1310nm (4λ CWDM)". */
  wavelengths?: string;
  /** Reach expressed in meters. */
  reachMeters?: number;
  /** Human-readable reach label. */
  reachLabel?: string;
  /** Power consumption in watts. */
  powerConsumptionW?: number;
  /** Temperature range code, e.g. COM, IND. */
  tempRange?: string;
  /** Modulation scheme, e.g. NRZ, PAM4. */
  modulation?: string;
  /** Whether DOM is supported. */
  domSupport?: boolean;
  /** Product image URL. */
  imageUrl?: string;
  /** Datasheet URL. */
  datasheetUrl?: string;
  /** Where the specs came from, e.g. "fs.com", "flexoptix.net". */
  source: string;
}
|
||||||
|
|
||||||
|
/**
 * Write scraped spec values for one transceiver into the `transceivers` table.
 *
 * Every field of `specs` that carries a usable value (non-empty string,
 * number greater than 0, or a defined boolean) is written to its column,
 * overwriting whatever is stored there. Fields without a usable value are left
 * untouched. Whenever at least one field is written, `data_confidence` is set
 * to 'scraped_unverified' and `updated_at` to NOW().
 *
 * NOTE(review): the confidence value is written unconditionally, so a row that
 * already holds a higher confidence level would be overwritten as well —
 * confirm callers only pass rows that should be (re)classified.
 *
 * @param specs - Scraped values plus the id of the row to update.
 * @returns `true` when an UPDATE was issued and did not report zero affected
 *   rows; `false` when there was nothing to write or no row matched the id.
 */
export async function updateVerifiedSpecs(specs: VerifiedSpecs): Promise<boolean> {
  const assignments: string[] = [];
  const values: unknown[] = [];

  // Registers one bound value; the placeholder number is derived from the
  // value's position so numbering can never drift from the values array.
  const assign = (column: string, value: unknown, trailing = ""): void => {
    values.push(value);
    assignments.push(`${column} = $${values.length}${trailing}`);
  };

  if (specs.fiberType) assign("fiber_type", specs.fiberType);
  if (specs.connector) assign("connector", specs.connector);
  if (specs.wavelengths) assign("wavelengths", specs.wavelengths);
  // `> 0` also rejects NaN, so unparsable numbers are never written.
  if (specs.reachMeters !== undefined && specs.reachMeters > 0) {
    assign("reach_meters", specs.reachMeters);
  }
  if (specs.reachLabel) assign("reach_label", specs.reachLabel);
  if (specs.powerConsumptionW !== undefined && specs.powerConsumptionW > 0) {
    assign("power_consumption_w", specs.powerConsumptionW);
  }
  if (specs.tempRange) assign("temp_range", specs.tempRange);
  if (specs.modulation) assign("modulation", specs.modulation);
  // `false` is a meaningful value here, so only `undefined` is skipped.
  if (specs.domSupport !== undefined) assign("dom_support", specs.domSupport);
  if (specs.imageUrl) assign("image_url", specs.imageUrl, ", has_image = true");
  // NOTE(review): the datasheet URL is stored in `datasheet_r2_key` — confirm
  // against the schema that this column is meant to hold a URL.
  if (specs.datasheetUrl) assign("datasheet_r2_key", specs.datasheetUrl);

  if (assignments.length === 0) return false;

  assignments.push("data_confidence = 'scraped_unverified'", "updated_at = NOW()");
  values.push(specs.transceiverId);

  const result = await pool.query(
    `UPDATE transceivers SET ${assignments.join(", ")} WHERE id = $${values.length}`,
    values
  );

  // Report success only if the statement did not explicitly affect zero rows
  // (an unknown/absent row count is treated as success, as before).
  return result.rowCount !== 0;
}
|
||||||
|
|
||||||
|
/**
 * Parse a key/value spec table scraped from a product page into structured data.
 *
 * Each entry is matched by its lower-cased key against a set of known spec
 * categories (fiber type, connector, wavelength, reach, power, temperature,
 * DOM, modulation). A key may match several categories; when several rows feed
 * the same field, the last matching row wins.
 *
 * @param specs - Raw label → value pairs as they appear on the page.
 * @returns The fields that could be recognised; unrecognised fields are absent.
 */
export function parseSpecTable(specs: Record<string, string>): Partial<VerifiedSpecs> {
  const result: Partial<VerifiedSpecs> = {};

  for (const [rawKey, rawVal] of Object.entries(specs)) {
    const key = rawKey.toLowerCase().trim();
    const val = rawVal.trim();

    // Fiber type
    if ((key.includes("fiber") && key.includes("type")) || key === "cable type" || key === "media") {
      if (/single.?mode|smf|os2/i.test(val)) result.fiberType = "SMF";
      else if (/multi.?mode|mmf|om[1-5]/i.test(val)) result.fiberType = "MMF";
      else if (/copper|cat[56]/i.test(val)) result.fiberType = "Copper";
      else if (/aoc|active.optical/i.test(val)) result.fiberType = "AOC";
    }

    // Connector
    if (key.includes("connector") || key.includes("interface")) {
      if (/duplex\s*lc|lc\s*duplex|lc\/pc|lc\/upc|lc\/apc|\blc\b/i.test(val)) result.connector = "LC";
      else if (/sc\/pc|sc\/apc|\bsc\b/i.test(val)) result.connector = "SC";
      // Fibre-count variants are checked before the bare MPO/MTP fallback,
      // and MTP is accepted as a synonym for every count.
      else if (/(mpo|mtp)-?24/i.test(val)) result.connector = "MPO-24";
      else if (/(mpo|mtp)-?16/i.test(val)) result.connector = "MPO-16";
      else if (/mpo-?12|mtp-?12|mpo\b|mtp\b/i.test(val)) result.connector = "MPO-12";
      else if (/rj-?45|copper/i.test(val)) result.connector = "RJ45";
      // Word boundaries on both sides so words merely ending in "cs"/"sn"
      // (e.g. "specs", "optics") are not mistaken for a connector name.
      else if (/\bcs\b/i.test(val)) result.connector = "CS";
      else if (/\bsn\b/i.test(val)) result.connector = "SN";
    }

    // Wavelength
    if (key.includes("wavelength") || key.includes("laser") || key === "tx wavelength") {
      const nmMatch = val.match(/([\d.]+)\s*nm/i);
      if (nmMatch) result.wavelengths = `${nmMatch[1] ?? ""}nm`;
      // Multi-wavelength grids keep the full description instead of one number.
      if (/cwdm|dwdm/i.test(val)) result.wavelengths = val;
    }

    // Reach / distance
    if (key.includes("distance") || key.includes("reach") || key.includes("transmission") || key === "max link length") {
      const kmMatch = val.match(/([\d.]+)\s*km/i);
      const mMatch = val.match(/([\d.]+)\s*m\b/i);
      if (kmMatch) {
        const km = Number.parseFloat(kmMatch[1] ?? "");
        // "[\d.]+" can match a lone "."; ignore anything that is not a number.
        if (Number.isFinite(km)) {
          const meters = Math.round(km * 1000);
          result.reachMeters = meters;
          result.reachLabel = km >= 1 ? `${km}km` : `${meters}m`;
        }
      } else if (mMatch) {
        // parseFloat rather than parseInt so "0.5m" is not truncated to 0.
        const meters = Number.parseFloat(mMatch[1] ?? "");
        if (Number.isFinite(meters)) {
          result.reachMeters = Math.round(meters);
          result.reachLabel = `${meters}m`;
        }
      }
    }

    // Power consumption
    if (key.includes("power") && (key.includes("consumption") || key.includes("dissipation") || key.includes("max"))) {
      const wMatch = val.match(/([\d.]+)\s*w/i);
      if (wMatch) {
        const watts = Number.parseFloat(wMatch[1] ?? "");
        if (Number.isFinite(watts)) result.powerConsumptionW = watts;
      }
    }

    // Temperature — storage ratings are skipped so they cannot overwrite the
    // operating range parsed from another row.
    const isTemperatureKey = key.includes("temperature") || (key.includes("temp") && key.includes("range"));
    if (isTemperatureKey && !key.includes("storage")) {
      // The lower bound must be a standalone 0, so "-40 to 70" or "-20 to 70"
      // are not classified as commercial.
      if (/(?<!\d)0(?!\d).*70|commercial/i.test(val)) result.tempRange = "COM";
      else if (/[-−–]\s*40.*85|industrial/i.test(val)) result.tempRange = "IND";
    }

    // DOM / DDM
    if (key.includes("dom") || key.includes("ddm") || key.includes("diagnostic")) {
      // Negations are checked first: "Not Supported" / "No DDM" contain the
      // positive keywords but mean the opposite.
      const negated = /\b(no|not|non|none|without)\b|\bn\/a\b|unsupported/i.test(val);
      result.domSupport = !negated && /yes|supported|ddm|dom/i.test(val);
    }

    // Modulation
    if (key.includes("modulation") || key.includes("encoding")) {
      if (/pam4|pam-4/i.test(val)) result.modulation = "PAM4";
      else if (/nrz/i.test(val)) result.modulation = "NRZ";
    }
  }

  return result;
}
|
||||||
Loading…
x
Reference in New Issue
Block a user