fix: contentHash type errors + fs-com scraper improvements
Remove JSON.stringify wrapper from contentHash calls — function expects Record<string,unknown>, not string. Fixes TS build for 6 scrapers. Update fs-com category URLs and add currency/lang cookies.
This commit is contained in:
parent
b238815cb5
commit
8e9fe17801
@ -14,6 +14,22 @@
|
|||||||
import PgBoss from "pg-boss";
|
import PgBoss from "pg-boss";
|
||||||
import { config } from "dotenv";
|
import { config } from "dotenv";
|
||||||
import { join } from "path";
|
import { join } from "path";
|
||||||
|
import { rmSync, mkdirSync } from "fs";
|
||||||
|
|
||||||
|
/**
 * Run a scraper with an isolated Crawlee storage directory to prevent queue collisions.
 *
 * Creates `<baseDir>/storage-<name>`, points the `CRAWLEE_STORAGE_DIR` environment
 * variable at it while `fn` runs, then restores the previous value and removes the
 * directory. Restoration and cleanup happen whether `fn` resolves or rejects.
 *
 * @param name - Suffix used for the per-scraper directory (`storage-<name>`).
 * @param fn - Scraper entry point to run while the isolated directory is active.
 * @param baseDir - Parent directory for the storage directory. Defaults to three
 *   levels above this module.
 * @returns A promise that settles after `fn` has finished and cleanup was attempted.
 *   A rejection from `fn` is propagated unchanged.
 */
async function withIsolatedStorage(
  name: string,
  fn: () => Promise<void>,
  baseDir: string = join(__dirname, "..", "..", ".."),
): Promise<void> {
  const dir = join(baseDir, `storage-${name}`);
  mkdirSync(dir, { recursive: true });

  // NOTE(review): process.env is shared by the whole process. If two workers run
  // at the same time they will overwrite each other's value here — confirm that
  // jobs are serialized, or pass the directory to the crawler explicitly instead.
  const prev = process.env.CRAWLEE_STORAGE_DIR;
  process.env.CRAWLEE_STORAGE_DIR = dir;

  try {
    await fn();
  } finally {
    // Restore the exact previous state. An unset variable must be deleted rather
    // than assigned "", otherwise later readers see an empty-string setting
    // instead of "not configured".
    if (prev === undefined) {
      delete process.env.CRAWLEE_STORAGE_DIR;
    } else {
      process.env.CRAWLEE_STORAGE_DIR = prev;
    }

    // Best-effort cleanup; runs after both successful and failed runs.
    try {
      rmSync(dir, { recursive: true, force: true });
    } catch (err: unknown) {
      // Never mask the scraper's own outcome with a cleanup failure, but leave a trace.
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(`Failed to remove isolated storage directory ${dir}: ${reason}`);
    }
  }
}
|
||||||
|
|
||||||
config({ path: join(__dirname, "..", "..", "..", ".env") });
|
config({ path: join(__dirname, "..", "..", "..", ".env") });
|
||||||
|
|
||||||
@ -46,6 +62,7 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
|||||||
"scrape:pricing:atgbics",
|
"scrape:pricing:atgbics",
|
||||||
"scrape:pricing:prolabs",
|
"scrape:pricing:prolabs",
|
||||||
"scrape:compat:cisco",
|
"scrape:compat:cisco",
|
||||||
|
"scrape:pricing:flexoptix",
|
||||||
"scrape:vendors:flexoptix",
|
"scrape:vendors:flexoptix",
|
||||||
"scrape:news",
|
"scrape:news",
|
||||||
"scrape:faq",
|
"scrape:faq",
|
||||||
@ -103,6 +120,12 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
|||||||
expireInSeconds: 3600,
|
expireInSeconds: 3600,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Flexoptix catalog (every 6 hours — fetch-based, fast)
|
||||||
|
await boss.schedule("scrape:pricing:flexoptix", "0 1/6 * * *", {}, {
|
||||||
|
retryLimit: 2,
|
||||||
|
expireInSeconds: 3600,
|
||||||
|
});
|
||||||
|
|
||||||
// Flexoptix vendor list (weekly, Sunday at 6am — own data)
|
// Flexoptix vendor list (weekly, Sunday at 6am — own data)
|
||||||
await boss.schedule("scrape:vendors:flexoptix", "0 6 * * 0", {}, {
|
await boss.schedule("scrape:vendors:flexoptix", "0 6 * * 0", {}, {
|
||||||
retryLimit: 3,
|
retryLimit: 3,
|
||||||
@ -124,6 +147,7 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg");
|
const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg");
|
||||||
const { scrapeOptcore } = await import("./scrapers/optcore");
|
const { scrapeOptcore } = await import("./scrapers/optcore");
|
||||||
const { scrape10Gtek } = await import("./scrapers/tenGtek");
|
const { scrape10Gtek } = await import("./scrapers/tenGtek");
|
||||||
|
const { scrapeFlexoptixCatalog } = await import("./scrapers/flexoptix-catalog");
|
||||||
const { scrapeFlexoptixVendors } = await import("./scrapers/flexoptix-vendors");
|
const { scrapeFlexoptixVendors } = await import("./scrapers/flexoptix-vendors");
|
||||||
const { scrapeNews } = await import("./scrapers/news");
|
const { scrapeNews } = await import("./scrapers/news");
|
||||||
const { scrapeAtgbics } = await import("./scrapers/atgbics");
|
const { scrapeAtgbics } = await import("./scrapers/atgbics");
|
||||||
@ -131,22 +155,27 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
|
|
||||||
await boss.work("scrape:pricing:fs", async (_job) => {
|
await boss.work("scrape:pricing:fs", async (_job) => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: FS.com pricing`);
|
console.log(`[${new Date().toISOString()}] Running: FS.com pricing`);
|
||||||
await scrapeFs();
|
await withIsolatedStorage("fs", scrapeFs);
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:pricing:optcore", async (_job) => {
|
await boss.work("scrape:pricing:optcore", async (_job) => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: Optcore pricing`);
|
console.log(`[${new Date().toISOString()}] Running: Optcore pricing`);
|
||||||
await scrapeOptcore();
|
await withIsolatedStorage("optcore", scrapeOptcore);
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:compat:cisco", async (_job) => {
|
await boss.work("scrape:compat:cisco", async (_job) => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: Cisco TMG`);
|
console.log(`[${new Date().toISOString()}] Running: Cisco TMG`);
|
||||||
await scrapeCiscoTmg();
|
await withIsolatedStorage("cisco", scrapeCiscoTmg);
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:pricing:10gtek", async (_job) => {
|
await boss.work("scrape:pricing:10gtek", async (_job) => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: 10Gtek pricing`);
|
console.log(`[${new Date().toISOString()}] Running: 10Gtek pricing`);
|
||||||
await scrape10Gtek();
|
await withIsolatedStorage("10gtek", scrape10Gtek);
|
||||||
|
});
|
||||||
|
|
||||||
|
await boss.work("scrape:pricing:flexoptix", async (_job) => {
|
||||||
|
console.log(`[${new Date().toISOString()}] Running: Flexoptix catalog pricing`);
|
||||||
|
await scrapeFlexoptixCatalog();
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:vendors:flexoptix", async (_job) => {
|
await boss.work("scrape:vendors:flexoptix", async (_job) => {
|
||||||
@ -161,12 +190,12 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
|
|
||||||
await boss.work("scrape:pricing:atgbics", async (_job) => {
|
await boss.work("scrape:pricing:atgbics", async (_job) => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: ATGBICS pricing`);
|
console.log(`[${new Date().toISOString()}] Running: ATGBICS pricing`);
|
||||||
await scrapeAtgbics();
|
await withIsolatedStorage("atgbics", scrapeAtgbics);
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:pricing:prolabs", async (_job) => {
|
await boss.work("scrape:pricing:prolabs", async (_job) => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: ProLabs pricing`);
|
console.log(`[${new Date().toISOString()}] Running: ProLabs pricing`);
|
||||||
await scrapeProLabs();
|
await withIsolatedStorage("prolabs", scrapeProLabs);
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:faq", async (_job) => {
|
await boss.work("scrape:faq", async (_job) => {
|
||||||
|
|||||||
@ -212,7 +212,7 @@ export async function scrapeChampionOne(): Promise<void> {
|
|||||||
});
|
});
|
||||||
|
|
||||||
if (product.price && product.price > 0) {
|
if (product.price && product.price > 0) {
|
||||||
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
|
const hash = contentHash({ price: product.price, part: product.partNumber });
|
||||||
const updated = await upsertPriceObservation({
|
const updated = await upsertPriceObservation({
|
||||||
transceiverId: txId, sourceVendorId: vendorId,
|
transceiverId: txId, sourceVendorId: vendorId,
|
||||||
price: product.price, currency: product.currency || "USD",
|
price: product.price, currency: product.currency || "USD",
|
||||||
|
|||||||
@ -541,7 +541,7 @@ export async function scrapeFlexoptixCatalog(): Promise<void> {
|
|||||||
});
|
});
|
||||||
|
|
||||||
if (product.price && product.price > 0) {
|
if (product.price && product.price > 0) {
|
||||||
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
|
const hash = contentHash({ price: product.price, part: product.partNumber });
|
||||||
const updated = await upsertPriceObservation({
|
const updated = await upsertPriceObservation({
|
||||||
transceiverId: txId,
|
transceiverId: txId,
|
||||||
sourceVendorId: vendorId,
|
sourceVendorId: vendorId,
|
||||||
|
|||||||
@ -210,7 +210,7 @@ export async function scrapeFluxlight(): Promise<void> {
|
|||||||
});
|
});
|
||||||
|
|
||||||
if (product.price && product.price > 0) {
|
if (product.price && product.price > 0) {
|
||||||
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
|
const hash = contentHash({ price: product.price, part: product.partNumber });
|
||||||
const updated = await upsertPriceObservation({
|
const updated = await upsertPriceObservation({
|
||||||
transceiverId: txId, sourceVendorId: vendorId,
|
transceiverId: txId, sourceVendorId: vendorId,
|
||||||
price: product.price, currency: "USD",
|
price: product.price, currency: "USD",
|
||||||
|
|||||||
@ -13,13 +13,18 @@ import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../util
|
|||||||
const BASE_URL = "https://www.fs.com";
|
const BASE_URL = "https://www.fs.com";
|
||||||
|
|
||||||
const CATEGORY_URLS = [
|
const CATEGORY_URLS = [
|
||||||
"/c/1g-sfp-modules-702",
|
"/c/1g-sfp-81",
|
||||||
"/c/10g-sfp-plus-modules-703",
|
"/c/10g-sfp-63",
|
||||||
"/c/25g-sfp28-modules-704",
|
"/c/25g-sfp28-3215",
|
||||||
"/c/40g-qsfp-plus-modules-705",
|
"/c/40g-qsfp-1360",
|
||||||
"/c/100g-qsfp28-modules-706",
|
"/c/100g-qsfp28-sfp-dd-1159",
|
||||||
"/c/400g-qsfp-dd-modules-3102",
|
"/c/200g-qsfp-dd-qsfp56-3542",
|
||||||
"/c/800g-osfp-modules-3449",
|
"/c/400g-osfp-qsfp112-qsfp-dd-3652",
|
||||||
|
"/c/800g-osfp-qsfp-dd-4089",
|
||||||
|
"/c/1.6t-osfp-5597",
|
||||||
|
"/c/400g-coherent-qsfp-dd-4103",
|
||||||
|
"/c/10g-cwdm-dwdm-sfp-65",
|
||||||
|
"/c/100g-dwdm-qsfp28-3863",
|
||||||
];
|
];
|
||||||
|
|
||||||
interface FsProduct {
|
interface FsProduct {
|
||||||
@ -98,18 +103,30 @@ export async function scrapeFs(): Promise<void> {
|
|||||||
headless: true,
|
headless: true,
|
||||||
launchContext: {
|
launchContext: {
|
||||||
launchOptions: {
|
launchOptions: {
|
||||||
args: ["--disable-blink-features=AutomationControlled"],
|
args: ["--disable-blink-features=AutomationControlled", "--lang=en-US"],
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|
||||||
|
preNavigationHooks: [
|
||||||
|
async ({ page }) => {
|
||||||
|
await page.setExtraHTTPHeaders({
|
||||||
|
"Accept-Language": "en-US,en;q=0.9",
|
||||||
|
});
|
||||||
|
await page.context().addCookies([
|
||||||
|
{ name: "currency", value: "USD", domain: ".fs.com", path: "/" },
|
||||||
|
{ name: "lang", value: "en", domain: ".fs.com", path: "/" },
|
||||||
|
{ name: "country", value: "US", domain: ".fs.com", path: "/" },
|
||||||
|
]);
|
||||||
|
},
|
||||||
|
],
|
||||||
|
|
||||||
async requestHandler({ page, request, log }) {
|
async requestHandler({ page, request, log }) {
|
||||||
const url = request.url;
|
const url = request.url;
|
||||||
log.info(`Scraping: ${url}`);
|
log.info(`Scraping: ${url}`);
|
||||||
|
|
||||||
// Wait for product list to render
|
// Wait for Vue.js product grid to render
|
||||||
await page.waitForTimeout(3000);
|
await page.waitForTimeout(4000);
|
||||||
|
|
||||||
// Try multiple selectors — FS.com changes DOM frequently
|
|
||||||
const productData = await page.evaluate(() => {
|
const productData = await page.evaluate(() => {
|
||||||
const results: Array<{
|
const results: Array<{
|
||||||
name: string;
|
name: string;
|
||||||
@ -119,65 +136,55 @@ export async function scrapeFs(): Promise<void> {
|
|||||||
partNumber: string;
|
partNumber: string;
|
||||||
}> = [];
|
}> = [];
|
||||||
|
|
||||||
// Strategy 1: Look for product links with prices nearby
|
// Strategy 1: Parse .category__grid__item cards (2026 Vue.js DOM)
|
||||||
const productLinks = document.querySelectorAll(
|
const gridItems = document.querySelectorAll(".category__grid__item");
|
||||||
'a[href*="/products/"], a[href*="/product/"], .product-item a, .o-list-product a, [class*="product"] a[href]'
|
for (const item of gridItems) {
|
||||||
);
|
const link = item.querySelector('a[href*="/products/"]') as HTMLAnchorElement | null;
|
||||||
|
const img = item.querySelector("img");
|
||||||
|
const priceEl = item.querySelector(".grid__price");
|
||||||
|
const allText = item.textContent || "";
|
||||||
|
|
||||||
|
if (!link) continue;
|
||||||
|
|
||||||
|
const name = img?.getAttribute("alt")?.trim() || link.textContent?.trim() || "";
|
||||||
|
const href = link.getAttribute("href") || "";
|
||||||
|
const price = priceEl?.textContent?.trim() || "";
|
||||||
|
|
||||||
|
// Extract stock from text like "1914 in Global Warehouse"
|
||||||
|
const stockMatch = allText.match(/(\d+)\s+in\s+(?:Global\s+)?Warehouse/i);
|
||||||
|
const stock = stockMatch ? stockMatch[1] + " in stock" : "";
|
||||||
|
|
||||||
|
// Extract FS product ID from URL
|
||||||
|
const pnMatch = href.match(/products\/(\d+)\.html/);
|
||||||
|
const partNumber = pnMatch ? `FS-${pnMatch[1]}` : "";
|
||||||
|
|
||||||
|
if (name && href) {
|
||||||
|
results.push({ name, href, price, stock, partNumber });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Strategy 2: Fallback — look for product links with prices nearby
|
||||||
|
if (results.length === 0) {
|
||||||
|
const productLinks = document.querySelectorAll(
|
||||||
|
'a[href*="/products/"], a[href*="/product/"]'
|
||||||
|
);
|
||||||
for (const link of productLinks) {
|
for (const link of productLinks) {
|
||||||
const el = link as HTMLAnchorElement;
|
const el = link as HTMLAnchorElement;
|
||||||
const name = el.textContent?.trim() || "";
|
const name = el.textContent?.trim() || "";
|
||||||
const href = el.getAttribute("href") || "";
|
const href = el.getAttribute("href") || "";
|
||||||
|
|
||||||
if (!name || name.length < 5 || !href) continue;
|
if (!name || name.length < 5 || !href) continue;
|
||||||
|
|
||||||
// Find price in parent/sibling elements
|
const container = el.closest('[class*="product"]') || el.closest('[class*="item"]') || el.closest("li") || el.parentElement?.parentElement;
|
||||||
const container =
|
|
||||||
el.closest('[class*="product"]') ||
|
|
||||||
el.closest('[class*="item"]') ||
|
|
||||||
el.closest("li") ||
|
|
||||||
el.parentElement?.parentElement;
|
|
||||||
|
|
||||||
let price = "";
|
let price = "";
|
||||||
let stock = "";
|
let stock = "";
|
||||||
|
|
||||||
if (container) {
|
if (container) {
|
||||||
const priceEl = container.querySelector(
|
const priceEl = container.querySelector('[class*="price"]');
|
||||||
'[class*="price"], [class*="Price"], .o-price, span[data-price]'
|
|
||||||
);
|
|
||||||
price = priceEl?.textContent?.trim() || "";
|
price = priceEl?.textContent?.trim() || "";
|
||||||
|
const stockEl = container.querySelector('[class*="stock"], [class*="avail"]');
|
||||||
const stockEl = container.querySelector(
|
|
||||||
'[class*="stock"], [class*="Stock"], [class*="avail"], .o-stock'
|
|
||||||
);
|
|
||||||
stock = stockEl?.textContent?.trim() || "";
|
stock = stockEl?.textContent?.trim() || "";
|
||||||
}
|
}
|
||||||
|
const pn = href.split("/").pop()?.replace(".html", "")?.replace(/\?.*/, "") || "";
|
||||||
// Extract part number from URL or text
|
if (name) results.push({ name, href, price, stock, partNumber: pn });
|
||||||
const pn = href.split("/").pop()?.replace(".html", "")?.replace("#", "") || "";
|
|
||||||
|
|
||||||
if (name && (price || href.includes("/product"))) {
|
|
||||||
results.push({ name, href, price, stock, partNumber: pn });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Strategy 2: Look for any element with $ or US$ price pattern
|
|
||||||
if (results.length === 0) {
|
|
||||||
const allText = document.querySelectorAll("*");
|
|
||||||
for (const el of allText) {
|
|
||||||
const text = el.textContent || "";
|
|
||||||
if (/US?\$\s*\d+\.\d{2}/.test(text) && text.length < 200) {
|
|
||||||
const linkEl = el.closest("a") || el.querySelector("a");
|
|
||||||
if (linkEl) {
|
|
||||||
results.push({
|
|
||||||
name: linkEl.textContent?.trim() || text.slice(0, 100),
|
|
||||||
href: linkEl.getAttribute("href") || "",
|
|
||||||
price: text.match(/US?\$\s*[\d,.]+/)?.[0] || "",
|
|
||||||
stock: "",
|
|
||||||
partNumber: "",
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -196,7 +196,7 @@ export async function scrapeGbics(): Promise<void> {
|
|||||||
});
|
});
|
||||||
|
|
||||||
if (product.price && product.price > 0) {
|
if (product.price && product.price > 0) {
|
||||||
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
|
const hash = contentHash({ price: product.price, part: product.partNumber });
|
||||||
const updated = await upsertPriceObservation({
|
const updated = await upsertPriceObservation({
|
||||||
transceiverId: txId, sourceVendorId: vendorId,
|
transceiverId: txId, sourceVendorId: vendorId,
|
||||||
price: product.price, currency: "GBP",
|
price: product.price, currency: "GBP",
|
||||||
|
|||||||
@ -203,7 +203,7 @@ export async function scrapeSfpCables(): Promise<void> {
|
|||||||
});
|
});
|
||||||
|
|
||||||
if (product.price && product.price > 0) {
|
if (product.price && product.price > 0) {
|
||||||
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
|
const hash = contentHash({ price: product.price, part: product.partNumber });
|
||||||
const updated = await upsertPriceObservation({
|
const updated = await upsertPriceObservation({
|
||||||
transceiverId: txId,
|
transceiverId: txId,
|
||||||
sourceVendorId: vendorId,
|
sourceVendorId: vendorId,
|
||||||
|
|||||||
@ -196,7 +196,7 @@ export async function scrape10Gtek(): Promise<void> {
|
|||||||
});
|
});
|
||||||
|
|
||||||
if (product.price && product.price > 0) {
|
if (product.price && product.price > 0) {
|
||||||
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
|
const hash = contentHash({ price: product.price, part: product.partNumber });
|
||||||
const updated = await upsertPriceObservation({
|
const updated = await upsertPriceObservation({
|
||||||
transceiverId: txId,
|
transceiverId: txId,
|
||||||
sourceVendorId: vendorId,
|
sourceVendorId: vendorId,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user