diff --git a/packages/scraper/src/scrapers/fs-com.ts b/packages/scraper/src/scrapers/fs-com.ts index 87a1d4f..c503811 100644 --- a/packages/scraper/src/scrapers/fs-com.ts +++ b/packages/scraper/src/scrapers/fs-com.ts @@ -9,6 +9,7 @@ import { PlaywrightCrawler } from "crawlee"; import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db"; import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash"; +import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater"; const BASE_URL = "https://www.fs.com"; @@ -270,6 +271,141 @@ export async function scrapeFs(): Promise { } console.log(`\nDatabase: ${written} new, ${skipped} unchanged (${uniqueProducts.size} unique)`); + + // ═══ PHASE 2: Crawl product detail pages for VERIFIED specs ═══ + console.log("\n=== Phase 2: Scraping product detail pages for verified specs ==="); + + // Get products that need spec verification (enriched_estimated or missing key fields) + const needsSpecs = await pool.query( + `SELECT t.id, t.part_number, t.slug FROM transceivers t + JOIN vendors v ON t.vendor_id = v.id + WHERE v.slug = 'fs-com' + AND (t.data_confidence = 'enriched_estimated' OR t.data_confidence = 'unknown' + OR t.connector IS NULL OR t.connector = '' OR t.connector = '-' + OR t.wavelengths IS NULL OR t.wavelengths = '' + OR t.fiber_type IS NULL OR t.fiber_type = '') + LIMIT 200` + ); + console.log(`Products needing spec verification: ${needsSpecs.rows.length}`); + + // Build a map of product URLs from our scraped data + const productUrls = new Map(); // transceiver_id → product URL + for (const p of uniqueProducts.values()) { + // Find the transceiver in DB by part number + const match = await pool.query( + `SELECT id FROM transceivers WHERE part_number = $1 AND vendor_id = $2`, + [p.partNumber, vendorId] + ).catch(() => ({ rows: [] })); + if (match.rows[0] && p.url) { + productUrls.set(match.rows[0].id, p.url); + } + } + + let specsUpdated = 0; 
+ const specCrawler = new PlaywrightCrawler({ + maxConcurrency: 1, + maxRequestsPerMinute: 10, + requestHandlerTimeoutSecs: 45, + headless: true, + launchContext: { + launchOptions: { + args: ["--disable-blink-features=AutomationControlled", "--lang=en-US"], + }, + }, + preNavigationHooks: [ + async ({ page }) => { + await page.context().addCookies([ + { name: "currency", value: "USD", domain: ".fs.com", path: "/" }, + { name: "lang", value: "en", domain: ".fs.com", path: "/" }, + ]); + }, + ], + async requestHandler({ page, request, log }) { + const transceiverId = request.userData?.transceiverId; + if (!transceiverId) return; + + log.info(`Spec scrape: ${request.url}`); + await page.waitForTimeout(3000); + + // Extract spec table from product detail page + const specData = await page.evaluate(() => { + const specs: Record = {}; + // fs.com uses various spec table formats + const rows = document.querySelectorAll( + ".product-param tr, .product-specs tr, table.param-table tr, " + + ".specifications tr, .detail-param tr, .prod-spec-list tr, " + + '[class*="specification"] tr, [class*="param"] tr' + ); + for (const row of rows) { + const cells = row.querySelectorAll("td, th"); + if (cells.length >= 2) { + const key = (cells[0]?.textContent || "").trim(); + const val = (cells[1]?.textContent || "").trim(); + if (key && val && key.length < 100) specs[key] = val; + } + } + + // Also try dl/dt/dd pattern + const dts = document.querySelectorAll("dt, .spec-label, .param-label"); + for (const dt of dts) { + const dd = dt.nextElementSibling; + if (dd && (dd.tagName === "DD" || dd.classList.contains("spec-value") || dd.classList.contains("param-value"))) { + const key = (dt.textContent || "").trim(); + const val = (dd.textContent || "").trim(); + if (key && val) specs[key] = val; + } + } + + // Extract image + const img = document.querySelector('.product-image img, .prod-img img, [class*="gallery"] img, .product-detail img'); + const imageUrl = img?.getAttribute("src") || ""; 
+ + // Extract datasheet link + const dsLink = document.querySelector('a[href*="datasheet"], a[href*=".pdf"]'); + const datasheetUrl = dsLink?.getAttribute("href") || ""; + + return { specs, imageUrl, datasheetUrl }; + }); + + if (Object.keys(specData.specs).length > 0) { + const parsed = parseSpecTable(specData.specs); + const updated = await updateVerifiedSpecs({ + transceiverId, + fiberType: parsed.fiberType, + connector: parsed.connector, + wavelengths: parsed.wavelengths, + reachMeters: parsed.reachMeters, + reachLabel: parsed.reachLabel, + powerConsumptionW: parsed.powerConsumptionW, + tempRange: parsed.tempRange, + modulation: parsed.modulation, + domSupport: parsed.domSupport, + imageUrl: specData.imageUrl ? (specData.imageUrl.startsWith("http") ? specData.imageUrl : `${BASE_URL}${specData.imageUrl}`) : undefined, + datasheetUrl: specData.datasheetUrl ? (specData.datasheetUrl.startsWith("http") ? specData.datasheetUrl : `${BASE_URL}${specData.datasheetUrl}`) : undefined, + source: "fs.com", + }); + if (updated) specsUpdated++; + } + }, + }); + + // Build spec crawl requests (limit to 200 per run to avoid rate limiting) + const specRequests = needsSpecs.rows + .filter(r => productUrls.has(r.id)) + .slice(0, 200) + .map(r => ({ + url: productUrls.get(r.id)!, + userData: { transceiverId: r.id }, + })); + + if (specRequests.length > 0) { + console.log(`Crawling ${specRequests.length} product detail pages for specs...`); + await specCrawler.run(specRequests); + console.log(`Specs verified: ${specsUpdated} products updated`); + } else { + console.log("No product URLs available for spec verification this run"); + } + console.log("=== FS.com Scraper Complete ===\n"); } diff --git a/packages/scraper/src/utils/spec-updater.ts b/packages/scraper/src/utils/spec-updater.ts new file mode 100644 index 0000000..9b16b6e --- /dev/null +++ b/packages/scraper/src/utils/spec-updater.ts @@ -0,0 +1,186 @@ +/** + * Update transceiver specs with verified data from vendor product 
/**
 * Spec updater for vendor product pages.
 *
 * Writes specs scraped from a vendor product page onto the matching
 * `transceivers` row. Every field supplied with a usable value overwrites the
 * stored column; fields that are missing, empty, or non-positive are left
 * untouched. The row's `data_confidence` is raised to 'scraped_unverified'
 * only when it is currently NULL, 'enriched_estimated' or 'unknown'.
 */
import { pool } from "./db";

/** Normalized spec fields scraped from a vendor product page. */
export interface VerifiedSpecs {
  transceiverId: string;
  fiberType?: string; // SMF, MMF, Copper, AOC
  connector?: string; // LC, SC, MPO-12, MPO-16, RJ45, DAC, AOC
  wavelengths?: string; // "850nm", "1310nm", "1310nm (4λ CWDM)", etc.
  reachMeters?: number;
  reachLabel?: string;
  powerConsumptionW?: number;
  tempRange?: string; // COM, IND
  modulation?: string; // NRZ, PAM4
  domSupport?: boolean;
  imageUrl?: string;
  datasheetUrl?: string;
  source: string; // "fs.com", "flexoptix.net", etc.
}

/**
 * Update a transceiver row with specs scraped from a vendor product page.
 *
 * Only fields carrying a usable value are written: non-empty strings,
 * finite positive numbers, and a defined `domSupport`. When nothing is usable
 * no query is issued.
 *
 * @param specs Scraped values plus the id of the row to update.
 * @returns `true` when an UPDATE was issued and matched a row; `false` when
 *   there was nothing to write or no row has that id.
 */
export async function updateVerifiedSpecs(specs: VerifiedSpecs): Promise<boolean> {
  const assignments: string[] = [];
  const values: unknown[] = [];

  // Registers `column = $n`, keeping the placeholder number in lock-step with `values`.
  const assign = (column: string, value: unknown): void => {
    values.push(value);
    assignments.push(`${column} = $${values.length}`);
  };
  // NaN, 0, negatives and Infinity are all treated as "no value".
  const isPositive = (n: number | undefined): n is number =>
    typeof n === "number" && Number.isFinite(n) && n > 0;

  if (specs.fiberType) assign("fiber_type", specs.fiberType);
  if (specs.connector) assign("connector", specs.connector);
  if (specs.wavelengths) assign("wavelengths", specs.wavelengths);
  if (isPositive(specs.reachMeters)) assign("reach_meters", specs.reachMeters);
  if (specs.reachLabel) assign("reach_label", specs.reachLabel);
  if (isPositive(specs.powerConsumptionW)) assign("power_consumption_w", specs.powerConsumptionW);
  if (specs.tempRange) assign("temp_range", specs.tempRange);
  if (specs.modulation) assign("modulation", specs.modulation);
  if (specs.domSupport !== undefined) assign("dom_support", specs.domSupport);
  if (specs.imageUrl) {
    assign("image_url", specs.imageUrl);
    assignments.push("has_image = true");
  }
  if (specs.datasheetUrl) {
    // NOTE(review): a URL is stored in a column named like a storage key —
    // confirm against the schema that this is the intended column.
    assign("datasheet_r2_key", specs.datasheetUrl);
  }

  if (assignments.length === 0) return false;

  // Upgrade low-confidence rows only; never downgrade a row that already
  // carries a different (possibly higher) confidence level.
  assignments.push(
    "data_confidence = CASE WHEN data_confidence IS NULL " +
      "OR data_confidence IN ('enriched_estimated', 'unknown') " +
      "THEN 'scraped_unverified' ELSE data_confidence END"
  );
  assignments.push("updated_at = NOW()");

  const { rowCount } = await pool.query(
    `UPDATE transceivers SET ${assignments.join(", ")} WHERE id = $${values.length + 1}`,
    [...values, specs.transceiverId]
  );

  // Report success only when a row was actually touched. If the driver does
  // not expose a row count, keep the previous "issued means updated" result.
  return rowCount == null || rowCount > 0;
}
/**
 * Parse a scraped key/value spec table into normalized transceiver fields.
 *
 * Keys are matched case-insensitively by substring, so one row may feed
 * several fields, and a later matching row overwrites an earlier one. Fields
 * with no recognizable row are left unset.
 *
 * @param specs Raw label -> value pairs taken from a product page.
 * @returns The subset of spec fields that could be recognized.
 */
export function parseSpecTable(specs: Record<string, string>): Partial<VerifiedSpecs> {
  const result: Partial<VerifiedSpecs> = {};

  // A number must contain at least one digit, so a stray "." can never parse to NaN.
  const nmRe = /(\d*\.?\d+)\s*nm/i;
  const kmRe = /(\d*\.?\d+)\s*km/i;
  const mRe = /(\d*\.?\d+)\s*m\b/i;
  const wattRe = /(\d*\.?\d+)\s*w/i;

  for (const [rawKey, rawVal] of Object.entries(specs)) {
    const key = rawKey.toLowerCase().trim();
    const val = rawVal.trim();

    // Fiber type
    if ((key.includes("fiber") && key.includes("type")) || key === "cable type" || key === "media") {
      if (/single.?mode|smf|os2/i.test(val)) result.fiberType = "SMF";
      else if (/multi.?mode|mmf|om[1-5]/i.test(val)) result.fiberType = "MMF";
      else if (/copper|cat[56]/i.test(val)) result.fiberType = "Copper";
      else if (/aoc|active.optical/i.test(val)) result.fiberType = "AOC";
    }

    // Connector
    if (key.includes("connector") || key.includes("interface")) {
      if (/duplex\s*lc|lc\s*duplex|lc\/pc|lc\/upc|lc\/apc/i.test(val)) result.connector = "LC";
      else if (/\blc\b/i.test(val)) result.connector = "LC";
      else if (/sc\/pc|sc\/apc|\bsc\b/i.test(val)) result.connector = "SC";
      else if (/mpo-?24/i.test(val)) result.connector = "MPO-24";
      else if (/mpo-?16/i.test(val)) result.connector = "MPO-16";
      else if (/mpo-?12|mtp-?12|mpo\b|mtp\b/i.test(val)) result.connector = "MPO-12";
      else if (/rj-?45|copper/i.test(val)) result.connector = "RJ45";
      // Whole-word match only: "Optics" must not be read as a CS connector.
      else if (/\bcs\b/i.test(val)) result.connector = "CS";
      else if (/\bsn\b/i.test(val)) result.connector = "SN";
    }

    // Wavelength
    if (key.includes("wavelength") || key.includes("laser") || key === "tx wavelength") {
      const nm = nmRe.exec(val)?.[1];
      if (nm !== undefined) result.wavelengths = `${nm}nm`;
      // Multi-wavelength grids keep the vendor's full description.
      if (/cwdm/i.test(val)) result.wavelengths = val;
      if (/dwdm/i.test(val)) result.wavelengths = val;
    }

    // Reach / distance
    if (
      key.includes("distance") ||
      key.includes("reach") ||
      key.includes("transmission") ||
      key === "max link length"
    ) {
      const kmText = kmRe.exec(val)?.[1];
      const mText = mRe.exec(val)?.[1];
      if (kmText !== undefined) {
        const km = parseFloat(kmText);
        const meters = Math.round(km * 1000);
        if (meters > 0) {
          result.reachMeters = meters;
          result.reachLabel = km >= 1 ? `${km}km` : `${meters}m`;
        }
      } else if (mText !== undefined) {
        const meters = parseFloat(mText);
        if (meters > 0) {
          // Sub-metre cables (e.g. 0.5m) keep their exact label and round up
          // to 1 rather than collapsing to an unusable 0.
          result.reachMeters = Math.max(1, Math.round(meters));
          result.reachLabel = `${meters}m`;
        }
      }
    }

    // Power consumption
    if (
      key.includes("power") &&
      (key.includes("consumption") || key.includes("dissipation") || key.includes("max"))
    ) {
      const watts = wattRe.exec(val)?.[1];
      if (watts !== undefined) result.powerConsumptionW = parseFloat(watts);
    }

    // Temperature: storage ranges say nothing about the operating class.
    if (
      !key.includes("storage") &&
      (key.includes("temperature") || (key.includes("temp") && key.includes("range")))
    ) {
      if (/0.*70|commercial/i.test(val)) result.tempRange = "COM";
      else if (/-40.*85|industrial/i.test(val)) result.tempRange = "IND";
    }

    // DOM / DDM: a negated value ("Not Supported", "Unsupported", "No") is a no.
    if (key.includes("dom") || key.includes("ddm") || key.includes("diagnostic")) {
      const negated = /\b(?:no|not|none|without)\b|unsupported|\bn\/a\b/i.test(val);
      result.domSupport = !negated && /yes|supported|ddm|dom/i.test(val);
    }

    // Modulation
    if (key.includes("modulation") || key.includes("encoding")) {
      if (/pam4|pam-4/i.test(val)) result.modulation = "PAM4";
      else if (/nrz/i.test(val)) result.modulation = "NRZ";
    }
  }

  return result;
}