/*
 * Phase 0 - Foundation:
 *   - Restructure into npm workspace monorepo (packages/core, api, scraper)
 *   - PostgreSQL 17 + TimescaleDB schema (15 tables incl. hypertables)
 *   - Docker Compose for local dev (PostgreSQL on 5433 + Qdrant)
 *   - Express 5 API on port 3200 with 6 routes
 *   - Seed script to migrate 159 transceivers + 42 standards from npm package
 *   - Erik server setup script + PM2 ecosystem config
 *
 * Phase 1 - Scraper Engine:
 *   - Crawlee + Playwright framework with pg-boss scheduler
 *   - FS.com scraper (PlaywrightCrawler, anti-bot workaround)
 *   - Optcore.net scraper (WP REST API enumeration + PlaywrightCrawler)
 *       - Uses /wp-json/wp/v2/product to get 2000+ product URLs
 *       - Playwright renders individual product pages for price extraction
 *   - Cisco TMG Matrix scraper (compatibility data)
 *   - News RSS aggregator (optics.org, SPIE, Network World, Nature Photonics)
 *       - Keyword relevance scoring for transceiver/fiber topics
 *       - xml2js with malformed XML sanitization
 *   - SHA-256 content hashing for change detection (skip unchanged records)
 *   - pg-boss v10 with explicit queue creation before scheduling
 *
 * File listing metadata: 72 lines, 2.3 KiB, TypeScript
 */
import { createHash } from "crypto";
/**
 * Generate a short SHA-256 fingerprint of a record for change detection.
 *
 * The record is serialised to canonical JSON (object keys sorted at every
 * nesting level, no whitespace), so records with equal content produce the
 * same hash regardless of property insertion order. Every JSON-serialisable
 * property of `data` contributes to the hash, including nested objects and
 * arrays; callers that only care about a subset of fields (e.g. price, stock,
 * quantity) should pass an object containing just those fields.
 *
 * @param data - Record to fingerprint.
 * @returns The first 16 hex characters (64 bits) of the SHA-256 digest.
 */
export function contentHash(data: Record<string, unknown>): string {
  // Round-trip through JSON first so toJSON() hooks (e.g. Date), undefined
  // members and non-finite numbers are normalised exactly as JSON.stringify
  // would normalise them.
  const plain: unknown = JSON.parse(JSON.stringify(data));

  // A sorted key array must NOT be used as the JSON.stringify replacer here:
  // an array replacer is an allow-list applied at every depth, so nested
  // properties whose names are not also top-level keys would be silently
  // dropped and changes inside nested objects would go undetected.
  const normalized = canonicalJson(plain);

  return createHash("sha256").update(normalized).digest("hex").slice(0, 16);
}

/** Serialise a plain JSON value with object keys in sorted (UTF-16 code unit) order. */
function canonicalJson(value: unknown): string {
  if (Array.isArray(value)) {
    return `[${value.map((item) => canonicalJson(item)).join(",")}]`;
  }
  if (typeof value === "object" && value !== null) {
    const members = Object.entries(value)
      .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0))
      .map(([key, member]) => `${JSON.stringify(key)}:${canonicalJson(member)}`);
    return `{${members.join(",")}}`;
  }
  // Primitives (string, number, boolean, null) use the standard encoding.
  return JSON.stringify(value);
}
/**
 * Parse a scraped price string into a numeric amount and a currency code.
 *
 * Handles: "$12.50", "12,50 €", "US$12.50", "12.50 USD", "$1,299.00",
 * "1.299,00 €", "12.50 EUR".
 *
 * Separator rules:
 * - When both "." and "," occur, the right-most one is the decimal separator
 *   and the other is a thousands separator.
 * - A separator that occurs more than once is a thousands separator.
 * - A single "," followed by exactly three digits (and preceded by 1-3 digits
 *   not starting with 0) is treated as a thousands separator ("$1,299");
 *   any other single "," is a decimal comma ("12,50").
 * - A single "." is always a decimal point.
 *
 * The input is expected to contain a single amount; all characters other
 * than digits and separators are ignored, so a sign is never preserved.
 *
 * @param raw - Price text as it appears on the page.
 * @returns The amount (0 when no number can be parsed) and the currency:
 *   "EUR", "GBP" or "CNY" when the matching symbol or code is present,
 *   otherwise "USD".
 */
export function parsePrice(raw: string): { price: number; currency: string } {
  // Keep digits and separators only, then drop trailing punctuation such as
  // a sentence-ending "." so it is not mistaken for a separator.
  const numeric = raw.replace(/[^\d.,]/g, "").replace(/[.,]+$/, "");
  const amount = parseFloat(normalizePriceSeparators(numeric));

  // Symbols and ISO codes; codes must not be embedded in a longer word.
  const currency = /€|(?<![a-z])EUR(?![a-z])/i.test(raw)
    ? "EUR"
    : /£|(?<![a-z])GBP(?![a-z])/i.test(raw)
      ? "GBP"
      : /¥|(?<![a-z])(?:CNY|RMB)(?![a-z])/i.test(raw)
        ? "CNY"
        : "USD";

  return { price: Number.isFinite(amount) ? amount : 0, currency };
}

/** Rewrite a digits-and-separators string so that "." is the only (decimal) separator. */
function normalizePriceSeparators(numeric: string): string {
  const lastComma = numeric.lastIndexOf(",");
  const lastDot = numeric.lastIndexOf(".");

  if (lastComma !== -1 && lastDot !== -1) {
    // Mixed separators: the right-most one marks the decimals.
    const decimalIndex = Math.max(lastComma, lastDot);
    const whole = numeric.slice(0, decimalIndex).replace(/[.,]/g, "");
    return `${whole}.${numeric.slice(decimalIndex + 1)}`;
  }

  const separator = lastComma !== -1 ? "," : ".";
  const parts = numeric.split(separator);
  if (parts.length === 1) return numeric; // no separator at all
  if (parts.length > 2) return parts.join(""); // repeated separator => grouping

  const whole = parts[0] ?? "";
  const fraction = parts[1] ?? "";
  const isGroupingComma =
    separator === "," && fraction.length === 3 && /^[1-9]\d{0,2}$/.test(whole);
  return isGroupingComma ? `${whole}${fraction}` : `${whole}.${fraction}`;
}
/**
 * Determine the stock level from various English/German text representations.
 *
 * Matching is case-insensitive and checks run from most to least specific,
 * because the positive phrases are substrings of the negative ones
 * ("unavailable" contains "available", "not in stock" contains "in stock"):
 *
 * 1. discontinued - "discontinued", "end of life", the word "eol"
 * 2. out_of_stock - "out of stock", "sold out", "unavailable",
 *    "not available", "no longer available", "not in stock",
 *    "nicht verfügbar", "nicht auf lager"
 * 3. low_stock    - "low stock", "few left", the word "limited"
 * 4. in_stock     - "in stock", "auf lager", "available"
 *
 * @param raw - Stock/availability text as it appears on the page.
 * @returns The matched level, or "on_request" when nothing matches.
 */
export function parseStockLevel(
  raw: string
): "in_stock" | "low_stock" | "out_of_stock" | "on_request" | "discontinued" {
  const text = raw.toLowerCase();
  const mentions = (...phrases: string[]): boolean =>
    phrases.some((phrase) => text.includes(phrase));

  // "eol" is matched as a whole word so it cannot fire inside other words.
  if (mentions("discontinued", "end of life") || /\beol\b/.test(text)) {
    return "discontinued";
  }

  if (
    mentions(
      "out of stock",
      "sold out",
      "nicht verfügbar",
      "unavailable",
      "not available",
      "no longer available",
      "not in stock",
      "nicht auf lager"
    )
  ) {
    return "out_of_stock";
  }

  // Whole-word "limited" so that "unlimited" is not read as low stock.
  if (mentions("low stock", "few left") || /\blimited\b/.test(text)) {
    return "low_stock";
  }

  if (mentions("in stock", "auf lager", "available")) {
    return "in_stock";
  }

  return "on_request";
}
/**
 * Extract the numeric quantity from stock text.
 *
 * "23 in stock" → 23, "500+ available" → 500, "1,500 in stock" → 1500,
 * "2.500 Stück" → 2500
 *
 * The number must be directly followed (optionally after "+" and whitespace)
 * by one of: "in stock", "available", "auf lager", "stück", "units", "pcs".
 * "," or "." between groups of exactly three digits is treated as a
 * thousands separator.
 *
 * @param raw - Stock text as it appears on the page.
 * @returns The quantity, or `undefined` when no quantity is present.
 */
export function parseQuantity(raw: string): number | undefined {
  // First alternative: grouped digits such as "1,500" / "2.500";
  // second alternative: a plain run of digits.
  const quantityPattern =
    /(\d{1,3}(?:[.,]\d{3})+|\d+)\+?\s*(?:in stock|available|auf lager|stück|units|pcs)/i;

  const digits = quantityPattern.exec(raw)?.[1];
  if (digits === undefined) return undefined;

  return parseInt(digits.replace(/[.,]/g, ""), 10);
}
/**
 * Parse a lead time in days from text.
 *
 * "Ships in 3-5 days" → 5, "2 weeks" → 14, "5 working days" → 5,
 * "3 Werktage" → 3, "2 Wochen" → 14
 *
 * For a range the upper bound is returned, because the number must sit
 * directly in front of the unit. A day figure takes precedence over a week
 * figure when both occur. Units are matched as whole words, so e.g.
 * "2 weekdays" is not read as two weeks.
 *
 * @param raw - Shipping/lead-time text as it appears on the page.
 * @returns The lead time in days, or `undefined` when none is found.
 */
export function parseLeadTime(raw: string): number | undefined {
  // English "day(s)" with optional "business"/"working", plus German
  // "Tag(e/en)", "Werktag(e/en)", "Arbeitstag(e/en)".
  const dayPattern =
    /(\d+)\s*(?:(?:business|working)\s+)?(?:days?|(?:werk|arbeits)?tag(?:en?)?)\b/i;
  const days = dayPattern.exec(raw)?.[1];
  if (days !== undefined) return parseInt(days, 10);

  // English "week(s)" and German "Woche(n)".
  const weekPattern = /(\d+)\s*(?:weeks?|wochen?)\b/i;
  const weeks = weekPattern.exec(raw)?.[1];
  if (weeks !== undefined) return parseInt(weeks, 10) * 7;

  return undefined;
}