diff --git a/packages/scraper/src/scheduler.ts b/packages/scraper/src/scheduler.ts
index 6bb8fa1..5d039a5 100644
--- a/packages/scraper/src/scheduler.ts
+++ b/packages/scraper/src/scheduler.ts
@@ -14,6 +14,45 @@
 import PgBoss from "pg-boss";
 import { config } from "dotenv";
 import { join } from "path";
+import { rmSync, mkdirSync } from "fs";
+
+/**
+ * Run a scraper with its own Crawlee storage directory so that scrapers do
+ * not collide on a shared request queue.
+ *
+ * Sets CRAWLEE_STORAGE_DIR to `storage-<name>` (three directories above this
+ * file's directory) while `fn` runs. Afterwards the variable is put back to its
+ * previous state and the directory is removed, whether `fn` resolved or
+ * rejected; a rejection from `fn` still propagates to the caller.
+ *
+ * NOTE(review): process.env is shared by the whole process, so overlapping
+ * calls would see each other's value. Confirm these jobs never run concurrently.
+ *
+ * @param name - Suffix used to build the storage directory name.
+ * @param fn - Scraper entry point to run while the variable is set.
+ */
+async function withIsolatedStorage(name: string, fn: () => Promise<unknown>): Promise<void> {
+  const dir = join(__dirname, "..", "..", "..", `storage-${name}`);
+  mkdirSync(dir, { recursive: true });
+  const prev = process.env.CRAWLEE_STORAGE_DIR;
+  process.env.CRAWLEE_STORAGE_DIR = dir;
+  try {
+    await fn();
+  } finally {
+    // An unset variable must be deleted: assigning any value (even "") leaves it defined.
+    if (prev === undefined) {
+      delete process.env.CRAWLEE_STORAGE_DIR;
+    } else {
+      process.env.CRAWLEE_STORAGE_DIR = prev;
+    }
+    // Best-effort cleanup; this runs after failed runs as well.
+    try {
+      rmSync(dir, { recursive: true, force: true });
+    } catch (err) {
+      console.warn(`[withIsolatedStorage] could not remove ${dir}:`, err);
+    }
+  }
+}
 
 config({ path: join(__dirname, "..", "..", "..", ".env") });
 
@@ -46,6 +85,7 @@ export async function registerSchedules(boss: PgBoss): Promise {
     "scrape:pricing:atgbics",
     "scrape:pricing:prolabs",
     "scrape:compat:cisco",
+    "scrape:pricing:flexoptix",
     "scrape:vendors:flexoptix",
     "scrape:news",
     "scrape:faq",
@@ -103,6 +143,12 @@ export async function registerSchedules(boss: PgBoss): Promise {
     expireInSeconds: 3600,
   });
 
+  // Flexoptix catalog (every 6 hours — fetch-based, fast)
+  await boss.schedule("scrape:pricing:flexoptix", "0 1/6 * * *", {}, {
+    retryLimit: 2,
+    expireInSeconds: 3600,
+  });
+
   // Flexoptix vendor list (weekly, Sunday at 6am — own data)
   await boss.schedule("scrape:vendors:flexoptix", "0 6 * * 0", {}, {
     retryLimit: 3,
@@ -124,6 +170,7 @@ export async function registerWorkers(boss: PgBoss): Promise {
   const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg");
   const { scrapeOptcore } = await import("./scrapers/optcore");
   const { scrape10Gtek } = await import("./scrapers/tenGtek");
+  const { scrapeFlexoptixCatalog } = await import("./scrapers/flexoptix-catalog");
   const { scrapeFlexoptixVendors } = await import("./scrapers/flexoptix-vendors");
   const { scrapeNews } = await import("./scrapers/news");
   const { scrapeAtgbics } = await import("./scrapers/atgbics");
@@ -131,22 +178,27 @@ export async function registerWorkers(boss: PgBoss): Promise {
 
   await boss.work("scrape:pricing:fs", async (_job) => {
     console.log(`[${new Date().toISOString()}] Running: FS.com pricing`);
-    await scrapeFs();
+    await withIsolatedStorage("fs", scrapeFs);
   });
 
   await boss.work("scrape:pricing:optcore", async (_job) => {
     console.log(`[${new Date().toISOString()}] Running: Optcore pricing`);
-    await scrapeOptcore();
+    await withIsolatedStorage("optcore", scrapeOptcore);
   });
 
   await boss.work("scrape:compat:cisco", async (_job) => {
     console.log(`[${new Date().toISOString()}] Running: Cisco TMG`);
-    await scrapeCiscoTmg();
+    await withIsolatedStorage("cisco", scrapeCiscoTmg);
   });
 
   await boss.work("scrape:pricing:10gtek", async (_job) => {
     console.log(`[${new Date().toISOString()}] Running: 10Gtek pricing`);
-    await scrape10Gtek();
+    await withIsolatedStorage("10gtek", scrape10Gtek);
+  });
+
+  await boss.work("scrape:pricing:flexoptix", async (_job) => {
+    console.log(`[${new Date().toISOString()}] Running: Flexoptix catalog pricing`);
+    await scrapeFlexoptixCatalog();
   });
 
   await boss.work("scrape:vendors:flexoptix", async (_job) => {
@@ -161,12 +213,12 @@ export async function registerWorkers(boss: PgBoss): Promise {
 
   await boss.work("scrape:pricing:atgbics", async (_job) => {
     console.log(`[${new Date().toISOString()}] Running: ATGBICS pricing`);
-    await scrapeAtgbics();
+    await withIsolatedStorage("atgbics", scrapeAtgbics);
   });
 
   await boss.work("scrape:pricing:prolabs", async (_job) => {
     console.log(`[${new Date().toISOString()}] Running: ProLabs pricing`);
-    await scrapeProLabs();
+    await withIsolatedStorage("prolabs", scrapeProLabs);
   });
 
   await boss.work("scrape:faq", async (_job) => {
diff --git 
a/packages/scraper/src/scrapers/champion-one.ts b/packages/scraper/src/scrapers/champion-one.ts index e8a1c6b..f613ef7 100644 --- a/packages/scraper/src/scrapers/champion-one.ts +++ b/packages/scraper/src/scrapers/champion-one.ts @@ -212,7 +212,7 @@ export async function scrapeChampionOne(): Promise { }); if (product.price && product.price > 0) { - const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber })); + const hash = contentHash({ price: product.price, part: product.partNumber }); const updated = await upsertPriceObservation({ transceiverId: txId, sourceVendorId: vendorId, price: product.price, currency: product.currency || "USD", diff --git a/packages/scraper/src/scrapers/flexoptix-catalog.ts b/packages/scraper/src/scrapers/flexoptix-catalog.ts index ca2084f..ef023d9 100644 --- a/packages/scraper/src/scrapers/flexoptix-catalog.ts +++ b/packages/scraper/src/scrapers/flexoptix-catalog.ts @@ -541,7 +541,7 @@ export async function scrapeFlexoptixCatalog(): Promise { }); if (product.price && product.price > 0) { - const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber })); + const hash = contentHash({ price: product.price, part: product.partNumber }); const updated = await upsertPriceObservation({ transceiverId: txId, sourceVendorId: vendorId, diff --git a/packages/scraper/src/scrapers/fluxlight.ts b/packages/scraper/src/scrapers/fluxlight.ts index 684cd83..c786e6e 100644 --- a/packages/scraper/src/scrapers/fluxlight.ts +++ b/packages/scraper/src/scrapers/fluxlight.ts @@ -210,7 +210,7 @@ export async function scrapeFluxlight(): Promise { }); if (product.price && product.price > 0) { - const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber })); + const hash = contentHash({ price: product.price, part: product.partNumber }); const updated = await upsertPriceObservation({ transceiverId: txId, sourceVendorId: vendorId, price: product.price, currency: "USD", diff --git 
a/packages/scraper/src/scrapers/fs-com.ts b/packages/scraper/src/scrapers/fs-com.ts index 1bcf84f..87a1d4f 100644 --- a/packages/scraper/src/scrapers/fs-com.ts +++ b/packages/scraper/src/scrapers/fs-com.ts @@ -13,13 +13,18 @@ import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../util const BASE_URL = "https://www.fs.com"; const CATEGORY_URLS = [ - "/c/1g-sfp-modules-702", - "/c/10g-sfp-plus-modules-703", - "/c/25g-sfp28-modules-704", - "/c/40g-qsfp-plus-modules-705", - "/c/100g-qsfp28-modules-706", - "/c/400g-qsfp-dd-modules-3102", - "/c/800g-osfp-modules-3449", + "/c/1g-sfp-81", + "/c/10g-sfp-63", + "/c/25g-sfp28-3215", + "/c/40g-qsfp-1360", + "/c/100g-qsfp28-sfp-dd-1159", + "/c/200g-qsfp-dd-qsfp56-3542", + "/c/400g-osfp-qsfp112-qsfp-dd-3652", + "/c/800g-osfp-qsfp-dd-4089", + "/c/1.6t-osfp-5597", + "/c/400g-coherent-qsfp-dd-4103", + "/c/10g-cwdm-dwdm-sfp-65", + "/c/100g-dwdm-qsfp28-3863", ]; interface FsProduct { @@ -98,18 +103,30 @@ export async function scrapeFs(): Promise { headless: true, launchContext: { launchOptions: { - args: ["--disable-blink-features=AutomationControlled"], + args: ["--disable-blink-features=AutomationControlled", "--lang=en-US"], }, }, + preNavigationHooks: [ + async ({ page }) => { + await page.setExtraHTTPHeaders({ + "Accept-Language": "en-US,en;q=0.9", + }); + await page.context().addCookies([ + { name: "currency", value: "USD", domain: ".fs.com", path: "/" }, + { name: "lang", value: "en", domain: ".fs.com", path: "/" }, + { name: "country", value: "US", domain: ".fs.com", path: "/" }, + ]); + }, + ], + async requestHandler({ page, request, log }) { const url = request.url; log.info(`Scraping: ${url}`); - // Wait for product list to render - await page.waitForTimeout(3000); + // Wait for Vue.js product grid to render + await page.waitForTimeout(4000); - // Try multiple selectors — FS.com changes DOM frequently const productData = await page.evaluate(() => { const results: Array<{ name: string; @@ -119,65 
+136,55 @@ export async function scrapeFs(): Promise { partNumber: string; }> = []; - // Strategy 1: Look for product links with prices nearby - const productLinks = document.querySelectorAll( - 'a[href*="/products/"], a[href*="/product/"], .product-item a, .o-list-product a, [class*="product"] a[href]' - ); + // Strategy 1: Parse .category__grid__item cards (2026 Vue.js DOM) + const gridItems = document.querySelectorAll(".category__grid__item"); + for (const item of gridItems) { + const link = item.querySelector('a[href*="/products/"]') as HTMLAnchorElement | null; + const img = item.querySelector("img"); + const priceEl = item.querySelector(".grid__price"); + const allText = item.textContent || ""; - for (const link of productLinks) { - const el = link as HTMLAnchorElement; - const name = el.textContent?.trim() || ""; - const href = el.getAttribute("href") || ""; + if (!link) continue; - if (!name || name.length < 5 || !href) continue; + const name = img?.getAttribute("alt")?.trim() || link.textContent?.trim() || ""; + const href = link.getAttribute("href") || ""; + const price = priceEl?.textContent?.trim() || ""; - // Find price in parent/sibling elements - const container = - el.closest('[class*="product"]') || - el.closest('[class*="item"]') || - el.closest("li") || - el.parentElement?.parentElement; + // Extract stock from text like "1914 in Global Warehouse" + const stockMatch = allText.match(/(\d+)\s+in\s+(?:Global\s+)?Warehouse/i); + const stock = stockMatch ? stockMatch[1] + " in stock" : ""; - let price = ""; - let stock = ""; + // Extract FS product ID from URL + const pnMatch = href.match(/products\/(\d+)\.html/); + const partNumber = pnMatch ? 
`FS-${pnMatch[1]}` : ""; - if (container) { - const priceEl = container.querySelector( - '[class*="price"], [class*="Price"], .o-price, span[data-price]' - ); - price = priceEl?.textContent?.trim() || ""; - - const stockEl = container.querySelector( - '[class*="stock"], [class*="Stock"], [class*="avail"], .o-stock' - ); - stock = stockEl?.textContent?.trim() || ""; - } - - // Extract part number from URL or text - const pn = href.split("/").pop()?.replace(".html", "")?.replace("#", "") || ""; - - if (name && (price || href.includes("/product"))) { - results.push({ name, href, price, stock, partNumber: pn }); + if (name && href) { + results.push({ name, href, price, stock, partNumber }); } } - // Strategy 2: Look for any element with $ or US$ price pattern + // Strategy 2: Fallback — look for product links with prices nearby if (results.length === 0) { - const allText = document.querySelectorAll("*"); - for (const el of allText) { - const text = el.textContent || ""; - if (/US?\$\s*\d+\.\d{2}/.test(text) && text.length < 200) { - const linkEl = el.closest("a") || el.querySelector("a"); - if (linkEl) { - results.push({ - name: linkEl.textContent?.trim() || text.slice(0, 100), - href: linkEl.getAttribute("href") || "", - price: text.match(/US?\$\s*[\d,.]+/)?.[0] || "", - stock: "", - partNumber: "", - }); - } + const productLinks = document.querySelectorAll( + 'a[href*="/products/"], a[href*="/product/"]' + ); + for (const link of productLinks) { + const el = link as HTMLAnchorElement; + const name = el.textContent?.trim() || ""; + const href = el.getAttribute("href") || ""; + if (!name || name.length < 5 || !href) continue; + + const container = el.closest('[class*="product"]') || el.closest('[class*="item"]') || el.closest("li") || el.parentElement?.parentElement; + let price = ""; + let stock = ""; + if (container) { + const priceEl = container.querySelector('[class*="price"]'); + price = priceEl?.textContent?.trim() || ""; + const stockEl = 
container.querySelector('[class*="stock"], [class*="avail"]'); + stock = stockEl?.textContent?.trim() || ""; } + const pn = href.split("/").pop()?.replace(".html", "")?.replace(/\?.*/, "") || ""; + if (name) results.push({ name, href, price, stock, partNumber: pn }); } } diff --git a/packages/scraper/src/scrapers/gbics.ts b/packages/scraper/src/scrapers/gbics.ts index 3bf34ab..dfe1a7d 100644 --- a/packages/scraper/src/scrapers/gbics.ts +++ b/packages/scraper/src/scrapers/gbics.ts @@ -196,7 +196,7 @@ export async function scrapeGbics(): Promise { }); if (product.price && product.price > 0) { - const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber })); + const hash = contentHash({ price: product.price, part: product.partNumber }); const updated = await upsertPriceObservation({ transceiverId: txId, sourceVendorId: vendorId, price: product.price, currency: "GBP", diff --git a/packages/scraper/src/scrapers/sfpcables.ts b/packages/scraper/src/scrapers/sfpcables.ts index 46493d2..5e87754 100644 --- a/packages/scraper/src/scrapers/sfpcables.ts +++ b/packages/scraper/src/scrapers/sfpcables.ts @@ -203,7 +203,7 @@ export async function scrapeSfpCables(): Promise { }); if (product.price && product.price > 0) { - const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber })); + const hash = contentHash({ price: product.price, part: product.partNumber }); const updated = await upsertPriceObservation({ transceiverId: txId, sourceVendorId: vendorId, diff --git a/packages/scraper/src/scrapers/tenGtek.ts b/packages/scraper/src/scrapers/tenGtek.ts index e30fdcb..59ae2cd 100644 --- a/packages/scraper/src/scrapers/tenGtek.ts +++ b/packages/scraper/src/scrapers/tenGtek.ts @@ -196,7 +196,7 @@ export async function scrape10Gtek(): Promise { }); if (product.price && product.price > 0) { - const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber })); + const hash = contentHash({ price: 
product.price, part: product.partNumber }); const updated = await upsertPriceObservation({ transceiverId: txId, sourceVendorId: vendorId,