- New spec-updater utility: parseSpecTable() + updateVerifiedSpecs() - fs.com scraper now has 2 phases: Phase 1: Category pages → prices + stock (existing) Phase 2: Product detail pages → fiber_type, connector, wavelength, power, image, datasheet - Updates data_confidence from 'enriched_estimated' to 'scraped_unverified' - Processes up to 200 product pages per scraper run
421 lines
15 KiB
TypeScript
421 lines
15 KiB
TypeScript
/**
|
|
* FS.com Scraper — Prices, Stock, Product Catalog
|
|
*
|
|
* FS.com renders products client-side (JS), so we use PlaywrightCrawler.
|
|
* Categories: /c/optical-transceivers-9
|
|
*
|
|
 * Respects: robots.txt, rate limiting (at most 15 requests/min on category pages, 10/min on product detail pages)
|
|
*/
|
|
import { PlaywrightCrawler } from "crawlee";
|
|
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
|
|
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
|
|
import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
|
|
|
|
/** Site origin; prepended to the site-relative paths and hrefs used by this scraper. */
const BASE_URL = "https://www.fs.com";

/**
 * Category listing paths (relative to BASE_URL) that seed the Phase 1 crawl.
 * Each entry is turned into a start URL by prefixing BASE_URL.
 */
const CATEGORY_URLS = [
  // Standard optics, lowest to highest line rate
  "/c/1g-sfp-81",
  "/c/10g-sfp-63",
  "/c/25g-sfp28-3215",
  "/c/40g-qsfp-1360",
  "/c/100g-qsfp28-sfp-dd-1159",
  "/c/200g-qsfp-dd-qsfp56-3542",
  "/c/400g-osfp-qsfp112-qsfp-dd-3652",
  "/c/800g-osfp-qsfp-dd-4089",
  "/c/1.6t-osfp-5597",

  // Coherent and CWDM/DWDM optics
  "/c/400g-coherent-qsfp-dd-4103",
  "/c/10g-cwdm-dwdm-sfp-65",
  "/c/100g-dwdm-qsfp28-3863",
];
|
|
|
|
/**
 * One product card scraped from an fs.com category page, after price/stock
 * parsing and name-based attribute detection.
 */
interface FsProduct {
  // --- Identity -----------------------------------------------------------
  /**
   * `FS-<numeric id>` taken from the product URL when it matches
   * `products/<id>.html`; otherwise a fallback derived from the URL's last
   * segment or the first 50 characters of the name.
   */
  partNumber: string;
  /** Display name (image alt text, falling back to the link text). */
  name: string;
  /** Absolute product page URL. */
  url: string;

  // --- Offer --------------------------------------------------------------
  /** Numeric price as returned by parsePrice; only values > 0 are kept. */
  price: number;
  /** Currency as returned by parsePrice. */
  currency: string;
  /** Result of parseStockLevel, or "on_request" when no stock text was found. */
  stockLevel: string;
  /** Result of parseQuantity; absent when no stock text was found. */
  quantity?: number;

  // --- Attributes inferred from the product name ---------------------------
  /** Form factor label from detectFormFactor (e.g. "QSFP28"). */
  formFactor?: string;
  /** Line rate in Gbit/s from detectSpeed. */
  speedGbps?: number;
  /** Line rate label from detectSpeed (e.g. "100G"). */
  speed?: string;
  /** Reach label from detectReach (e.g. "10km"). */
  reachLabel?: string;
}
|
|
|
|
/**
 * Infers the transceiver form factor from free-form product text (typically
 * the product name). Matching is case-insensitive and substring-based.
 *
 * Checks run from most specific to least specific so that, for example,
 * "QSFP-DD800" is not reported as "QSFP-DD" and "QSFP28" is not reported as
 * "SFP28". Compared with the plain SFP/QSFP families, this also recognizes
 * "QSFP112" and "SFP-DD", both of which appear in the scraped categories.
 *
 * @param text Product name or other descriptive text.
 * @returns The form factor label, or `undefined` when none is recognized.
 */
function detectFormFactor(text: string): string | undefined {
  const lower = text.toLowerCase();
  const has = (needle: string): boolean => lower.includes(needle);
  const mentionsQsfp = has("qsfp");

  // "osfp" alone; texts that also mention QSFP are resolved by the QSFP rules.
  if (has("osfp") && !mentionsQsfp) return "OSFP";

  // QSFP family, most specific first.
  if (has("qsfp-dd800") || has("qsfp-dd 800")) return "QSFP-DD800";
  if (has("qsfp-dd")) return "QSFP-DD";
  if (has("qsfp112")) return "QSFP112";
  if (has("qsfp56")) return "QSFP56";
  if (has("qsfp28")) return "QSFP28";
  if (has("qsfp+") || has("qsfp plus")) return "QSFP+";

  // SFP family. "qsfp-dd" was handled above, so a remaining "sfp-dd" is the
  // double-density SFP form factor rather than a substring of QSFP-DD.
  if (has("sfp-dd")) return "SFP-DD";
  if (has("sfp56")) return "SFP56";
  if (has("sfp28")) return "SFP28";
  if (has("sfp+") || has("sfp plus")) return "SFP+";
  if (has("sfp") && !mentionsQsfp) return "SFP";

  if (has("cfp2")) return "CFP2";
  if (has("xfp")) return "XFP";
  return undefined;
}
|
|
|
|
/**
 * Line-rate patterns, checked from highest to lowest so that an aggregate
 * rate wins over a per-lane rate mentioned in the same name (e.g. "2x800G").
 *
 * Every pattern:
 *  - refuses to start in the middle of a larger or decimal number
 *    (`(?<![\d.])`), so "1.25G" is not read as 25G;
 *  - refuses a following "Hz" (`(?!hz)`), so DWDM channel spacing such as
 *    "100GHz" / "50GHz" is not mistaken for a line rate.
 */
const SPEED_PATTERNS: ReadonlyArray<readonly [RegExp, string, number]> = [
  [/(?<![\d.])1\.6\s*t(?:b|\b)/i, "1.6T", 1600],
  [/(?<![\d.])800\s*g(?!hz)/i, "800G", 800],
  [/(?<![\d.])400\s*g(?!hz)/i, "400G", 400],
  [/(?<![\d.])200\s*g(?!hz)/i, "200G", 200],
  [/(?<![\d.])100\s*g(?!hz)/i, "100G", 100],
  [/(?<![\d.])50\s*g(?!hz)/i, "50G", 50],
  [/(?<![\d.])40\s*g(?!hz)/i, "40G", 40],
  [/(?<![\d.])25\s*g(?!hz)/i, "25G", 25],
  [/(?<![\d.])10\s*g(?!hz)/i, "10G", 10],
  // 1G modules are usually named "1000BASE-…" rather than "1G".
  [/(?<![\d.])(?:1\s*g\b|1000\s*base)/i, "1G", 1],
];

/**
 * Infers the nominal line rate from free-form product text.
 *
 * @param text Product name or other descriptive text.
 * @returns The speed label (e.g. "100G") together with its value in Gbit/s,
 *          or `undefined` when no known rate is mentioned.
 */
function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined {
  for (const [pattern, speed, speedGbps] of SPEED_PATTERNS) {
    if (pattern.test(text)) return { speed, speedGbps };
  }
  return undefined;
}
|
|
|
|
/**
 * Extracts the first reach figure ("300m", "10km", "1.4km") from free-form
 * product text.
 *
 * The number may carry a decimal part, so "1.4km" is returned intact rather
 * than being truncated to "4km". Matching is case-insensitive; the unit is
 * normalized to lower case and any whitespace between number and unit is
 * dropped.
 *
 * @param text Product name or other descriptive text.
 * @returns The reach label, or `undefined` when no `<number> m|km` is found.
 */
function detectReach(text: string): string | undefined {
  // (?<![\d.]) keeps the match anchored at the start of a number.
  const match = /(?<![\d.])(\d+(?:\.\d+)?)\s*(km|m)\b/i.exec(text);
  if (!match) return undefined;
  return `${match[1]}${match[2].toLowerCase()}`;
}
|
|
|
|
/**
 * Resolves an href scraped from an fs.com page to an absolute URL.
 *
 * - hrefs that already start with "http" are returned unchanged;
 * - protocol-relative hrefs ("//host/path") get an "https:" scheme instead of
 *   being glued onto BASE_URL;
 * - everything else is resolved against BASE_URL, which also handles relative
 *   paths that lack a leading slash.
 */
function toAbsoluteUrl(href: string): string {
  if (href.startsWith("http")) return href;
  if (href.startsWith("//")) return `https:${href}`;
  try {
    return new URL(href, BASE_URL).toString();
  } catch {
    // Unparseable href: fall back to plain concatenation.
    return `${BASE_URL}${href}`;
  }
}

/**
 * Runs the FS.com scraper end to end.
 *
 * Phase 1 crawls every path in CATEGORY_URLS with a headless Playwright
 * crawler, extracts product cards (name, href, price, stock), de-duplicates
 * them by part number and writes one price observation per product via
 * findOrCreateScrapedTransceiver / upsertPriceObservation.
 *
 * Phase 2 selects up to 200 transceivers that were seen in Phase 1 and still
 * lack verified specs, crawls their product detail pages, and passes the
 * parsed spec table, image URL and datasheet URL to updateVerifiedSpecs.
 *
 * Per-product database errors in Phase 1 are logged and skipped. Errors from
 * ensureVendor, the crawlers, or the Phase 2 selection query reject the
 * returned promise.
 */
export async function scrapeFs(): Promise<void> {
  console.log("=== FS.com Scraper Starting ===\n");

  const vendorId = await ensureVendor(
    "FS.COM",
    "compatible",
    "https://www.fs.com",
    "https://www.fs.com/c/optical-transceivers-9"
  );
  console.log(`Vendor ID: ${vendorId}`);

  // ═══ PHASE 1: Category pages → prices + stock ═══
  const products: FsProduct[] = [];
  let pagesScraped = 0;

  const crawler = new PlaywrightCrawler({
    maxConcurrency: 1,
    maxRequestsPerMinute: 15,
    requestHandlerTimeoutSecs: 60,
    headless: true,
    launchContext: {
      launchOptions: {
        args: ["--disable-blink-features=AutomationControlled", "--lang=en-US"],
      },
    },

    preNavigationHooks: [
      async ({ page }) => {
        // Ask for the English/USD storefront before the page loads.
        await page.setExtraHTTPHeaders({
          "Accept-Language": "en-US,en;q=0.9",
        });
        await page.context().addCookies([
          { name: "currency", value: "USD", domain: ".fs.com", path: "/" },
          { name: "lang", value: "en", domain: ".fs.com", path: "/" },
          { name: "country", value: "US", domain: ".fs.com", path: "/" },
        ]);
      },
    ],

    async requestHandler({ page, request, log }) {
      const url = request.url;
      log.info(`Scraping: ${url}`);

      // Wait for Vue.js product grid to render
      await page.waitForTimeout(4000);

      // NOTE: this callback runs inside the browser page, so it must stay
      // self-contained and cannot reference anything from this module.
      const productData = await page.evaluate(() => {
        const results: Array<{
          name: string;
          href: string;
          price: string;
          stock: string;
          partNumber: string;
        }> = [];

        // Strategy 1: Parse .category__grid__item cards (2026 Vue.js DOM)
        const gridItems = document.querySelectorAll(".category__grid__item");
        for (const item of gridItems) {
          const link = item.querySelector('a[href*="/products/"]') as HTMLAnchorElement | null;
          const img = item.querySelector("img");
          const priceEl = item.querySelector(".grid__price");
          const allText = item.textContent || "";

          if (!link) continue;

          const name = img?.getAttribute("alt")?.trim() || link.textContent?.trim() || "";
          const href = link.getAttribute("href") || "";
          const price = priceEl?.textContent?.trim() || "";

          // Extract stock from text like "1914 in Global Warehouse"
          const stockMatch = allText.match(/(\d+)\s+in\s+(?:Global\s+)?Warehouse/i);
          const stock = stockMatch ? stockMatch[1] + " in stock" : "";

          // Extract FS product ID from URL
          const pnMatch = href.match(/products\/(\d+)\.html/);
          const partNumber = pnMatch ? `FS-${pnMatch[1]}` : "";

          if (name && href) {
            results.push({ name, href, price, stock, partNumber });
          }
        }

        // Strategy 2: Fallback — look for product links with prices nearby
        if (results.length === 0) {
          const productLinks = document.querySelectorAll(
            'a[href*="/products/"], a[href*="/product/"]'
          );
          for (const link of productLinks) {
            const el = link as HTMLAnchorElement;
            const name = el.textContent?.trim() || "";
            const href = el.getAttribute("href") || "";
            if (!name || name.length < 5 || !href) continue;

            const container =
              el.closest('[class*="product"]') ||
              el.closest('[class*="item"]') ||
              el.closest("li") ||
              el.parentElement?.parentElement;
            let price = "";
            let stock = "";
            if (container) {
              const priceEl = container.querySelector('[class*="price"]');
              price = priceEl?.textContent?.trim() || "";
              const stockEl = container.querySelector('[class*="stock"], [class*="avail"]');
              stock = stockEl?.textContent?.trim() || "";
            }
            // Last URL segment without ".html" and without any query string.
            const pn = href.split("/").pop()?.replace(".html", "")?.replace(/\?.*/, "") || "";
            if (name) results.push({ name, href, price, stock, partNumber: pn });
          }
        }

        return results;
      });

      for (const item of productData) {
        if (!item.name || !item.price) continue;

        const { price, currency } = parsePrice(item.price);
        if (price <= 0) continue;

        const speedInfo = detectSpeed(item.name);
        products.push({
          // Fall back to a name prefix when no part number could be derived.
          partNumber: item.partNumber || item.name.slice(0, 50),
          name: item.name,
          price,
          currency,
          stockLevel: item.stock ? parseStockLevel(item.stock) : "on_request",
          quantity: item.stock ? parseQuantity(item.stock) : undefined,
          url: toAbsoluteUrl(item.href),
          formFactor: detectFormFactor(item.name),
          speedGbps: speedInfo?.speedGbps,
          speed: speedInfo?.speed,
          reachLabel: detectReach(item.name),
        });
      }

      pagesScraped++;
      log.info(`  Found ${productData.length} items on page`);
    },
  });

  const startUrls = CATEGORY_URLS.map((path) => `${BASE_URL}${path}`);
  await crawler.run(startUrls);

  console.log(`\nPages scraped: ${pagesScraped}`);
  console.log(`Products found: ${products.length}`);

  // Deduplicate by partNumber (first occurrence wins)
  const uniqueProducts = new Map<string, FsProduct>();
  for (const p of products) {
    const key = p.partNumber || p.name;
    if (!uniqueProducts.has(key)) {
      uniqueProducts.set(key, p);
    }
  }

  // Write to database
  let written = 0;
  let skipped = 0;

  for (const p of uniqueProducts.values()) {
    try {
      const transceiverId = await findOrCreateScrapedTransceiver({
        partNumber: p.partNumber,
        vendorId,
        formFactor: p.formFactor,
        speedGbps: p.speedGbps,
        speed: p.speed,
        reachLabel: p.reachLabel,
        category: "DataCenter",
      });

      const hash = contentHash({ price: p.price, stock: p.stockLevel, qty: p.quantity });
      const isNew = await upsertPriceObservation({
        transceiverId,
        sourceVendorId: vendorId,
        price: p.price,
        currency: p.currency,
        stockLevel: p.stockLevel,
        quantityAvailable: p.quantity,
        url: p.url,
        contentHash: hash,
      });

      if (isNew) written++;
      else skipped++;
    } catch (err) {
      // One bad product must not abort the whole run.
      console.error(`  Error: ${p.partNumber}:`, (err as Error).message);
    }
  }

  console.log(`\nDatabase: ${written} new, ${skipped} unchanged (${uniqueProducts.size} unique)`);

  // ═══ PHASE 2: Crawl product detail pages for VERIFIED specs ═══
  console.log("\n=== Phase 2: Scraping product detail pages for verified specs ===");

  // Detail-page URL for every product scraped in this run, keyed by part number.
  const urlByPartNumber = new Map<string, string>();
  for (const p of uniqueProducts.values()) {
    if (p.url) urlByPartNumber.set(p.partNumber, p.url);
  }

  // Select products that need spec verification (enriched_estimated or missing
  // key fields) AND whose URL we know from this run. Restricting by part number
  // inside the query means LIMIT 200 applies to crawlable rows only, and it
  // replaces the previous one-lookup-per-product round trips with one query.
  const needsSpecs =
    urlByPartNumber.size > 0
      ? await pool.query(
          `SELECT t.id, t.part_number, t.slug FROM transceivers t
           JOIN vendors v ON t.vendor_id = v.id
           WHERE v.slug = 'fs-com'
             AND t.vendor_id = $1
             AND t.part_number = ANY($2::text[])
             AND (t.data_confidence = 'enriched_estimated' OR t.data_confidence = 'unknown'
               OR t.connector IS NULL OR t.connector = '' OR t.connector = '-'
               OR t.wavelengths IS NULL OR t.wavelengths = ''
               OR t.fiber_type IS NULL OR t.fiber_type = '')
           LIMIT 200`,
          [vendorId, [...urlByPartNumber.keys()]]
        )
      : { rows: [] };
  console.log(`Products needing spec verification: ${needsSpecs.rows.length}`);

  let specsUpdated = 0;
  const specCrawler = new PlaywrightCrawler({
    maxConcurrency: 1,
    maxRequestsPerMinute: 10,
    requestHandlerTimeoutSecs: 45,
    headless: true,
    launchContext: {
      launchOptions: {
        args: ["--disable-blink-features=AutomationControlled", "--lang=en-US"],
      },
    },
    preNavigationHooks: [
      async ({ page }) => {
        await page.context().addCookies([
          { name: "currency", value: "USD", domain: ".fs.com", path: "/" },
          { name: "lang", value: "en", domain: ".fs.com", path: "/" },
        ]);
      },
    ],
    async requestHandler({ page, request, log }) {
      const transceiverId = request.userData?.transceiverId;
      if (!transceiverId) return;

      log.info(`Spec scrape: ${request.url}`);
      await page.waitForTimeout(3000);

      // Extract spec table from product detail page.
      // NOTE: runs inside the browser page; must stay self-contained.
      const specData = await page.evaluate(() => {
        const specs: Record<string, string> = {};

        // fs.com uses various spec table formats
        const rows = document.querySelectorAll(
          ".product-param tr, .product-specs tr, table.param-table tr, " +
            ".specifications tr, .detail-param tr, .prod-spec-list tr, " +
            '[class*="specification"] tr, [class*="param"] tr'
        );
        for (const row of rows) {
          const cells = row.querySelectorAll("td, th");
          if (cells.length >= 2) {
            const key = (cells[0]?.textContent || "").trim();
            const val = (cells[1]?.textContent || "").trim();
            // Overly long "keys" are skipped rather than stored as spec names.
            if (key && val && key.length < 100) specs[key] = val;
          }
        }

        // Also try dl/dt/dd pattern
        const dts = document.querySelectorAll("dt, .spec-label, .param-label");
        for (const dt of dts) {
          const dd = dt.nextElementSibling;
          if (
            dd &&
            (dd.tagName === "DD" ||
              dd.classList.contains("spec-value") ||
              dd.classList.contains("param-value"))
          ) {
            const key = (dt.textContent || "").trim();
            const val = (dd.textContent || "").trim();
            if (key && val) specs[key] = val;
          }
        }

        // Extract image
        const img = document.querySelector(
          '.product-image img, .prod-img img, [class*="gallery"] img, .product-detail img'
        );
        const imageUrl = img?.getAttribute("src") || "";

        // Extract datasheet link
        const dsLink = document.querySelector('a[href*="datasheet"], a[href*=".pdf"]');
        const datasheetUrl = dsLink?.getAttribute("href") || "";

        return { specs, imageUrl, datasheetUrl };
      });

      // Nothing recognizable on the page: leave the record untouched.
      if (Object.keys(specData.specs).length === 0) return;

      const parsed = parseSpecTable(specData.specs);
      const updated = await updateVerifiedSpecs({
        transceiverId,
        fiberType: parsed.fiberType,
        connector: parsed.connector,
        wavelengths: parsed.wavelengths,
        reachMeters: parsed.reachMeters,
        reachLabel: parsed.reachLabel,
        powerConsumptionW: parsed.powerConsumptionW,
        tempRange: parsed.tempRange,
        modulation: parsed.modulation,
        domSupport: parsed.domSupport,
        imageUrl: specData.imageUrl ? toAbsoluteUrl(specData.imageUrl) : undefined,
        datasheetUrl: specData.datasheetUrl ? toAbsoluteUrl(specData.datasheetUrl) : undefined,
        source: "fs.com",
      });
      if (updated) specsUpdated++;
    },
  });

  // Build spec crawl requests (the query above already limits to 200 per run
  // to avoid rate limiting)
  const specRequests: Array<{ url: string; userData: { transceiverId: unknown } }> = [];
  for (const row of needsSpecs.rows) {
    const url = urlByPartNumber.get(row.part_number);
    if (url) {
      specRequests.push({ url, userData: { transceiverId: row.id } });
    }
  }

  if (specRequests.length > 0) {
    console.log(`Crawling ${specRequests.length} product detail pages for specs...`);
    await specCrawler.run(specRequests);
    console.log(`Specs verified: ${specsUpdated} products updated`);
  } else {
    console.log("No product URLs available for spec verification this run");
  }

  console.log("=== FS.com Scraper Complete ===\n");
}
|
|
|
|
// CLI entry point: run the scraper only when this file is executed directly
// (not when it is imported by another module).
if (require.main === module) {
  // Log the failure, start closing the DB pool, and exit with a non-zero code.
  const exitOnFatal = (err: unknown): void => {
    console.error("Fatal:", err);
    pool.end();
    process.exit(1);
  };

  scrapeFs()
    // On success, close the pool so the process can terminate on its own.
    .then(() => pool.end())
    .catch(exitOnFatal);
}
|