Rene Fichtmueller b43bdd3060 feat: TIP Phase 0+1 — monorepo, DB schema, API, scraper engine
Phase 0 - Foundation:
- Restructure into npm workspace monorepo (packages/core, api, scraper)
- PostgreSQL 17 + TimescaleDB schema (15 tables incl. hypertables)
- Docker Compose for local dev (PostgreSQL on 5433 + Qdrant)
- Express 5 API on port 3200 with 6 routes
- Seed script to migrate 159 transceivers + 42 standards from npm package
- Erik server setup script + PM2 ecosystem config

Phase 1 - Scraper Engine:
- Crawlee + Playwright framework with pg-boss scheduler
- FS.com scraper (PlaywrightCrawler, anti-bot workaround)
- Optcore.net scraper (WP REST API enumeration + PlaywrightCrawler)
  - Uses /wp-json/wp/v2/product to get 2000+ product URLs
  - Playwright renders individual product pages for price extraction
- Cisco TMG Matrix scraper (compatibility data)
- News RSS aggregator (optics.org, SPIE, Network World, Nature Photonics)
  - Keyword relevance scoring for transceiver/fiber topics
  - xml2js with malformed XML sanitization
- SHA-256 content hashing for change detection (skip unchanged records)
- pg-boss v10 with explicit queue creation before scheduling
2026-03-27 16:27:31 +13:00

72 lines
2.3 KiB
TypeScript

import { createHash } from "crypto";
/**
 * Serialise a value to JSON with object keys sorted at every nesting level,
 * so that logically equal records always produce the same string.
 *
 * Mirrors `JSON.stringify` semantics: returns `undefined` for values JSON
 * cannot represent (undefined, functions, symbols), drops such values from
 * objects and renders them as `null` inside arrays.
 */
function canonicalJson(value: unknown): string | undefined {
  if (value === null || typeof value !== "object") {
    return JSON.stringify(value);
  }
  if (Array.isArray(value)) {
    // Array.from visits holes as undefined, which JSON renders as null.
    const items = Array.from(value, (item) => canonicalJson(item) ?? "null");
    return `[${items.join(",")}]`;
  }
  const proto: unknown = Object.getPrototypeOf(value);
  if (proto !== Object.prototype && proto !== null) {
    // Dates, boxed primitives and class instances keep their native JSON form.
    return JSON.stringify(value);
  }
  const record = value as Record<string, unknown>;
  const members: string[] = [];
  for (const key of Object.keys(record).sort()) {
    const encoded = canonicalJson(record[key]);
    if (encoded !== undefined) {
      members.push(`${JSON.stringify(key)}:${encoded}`);
    }
  }
  return `{${members.join(",")}}`;
}

/**
 * Generate a short SHA-256 content hash for change detection.
 *
 * Every key present in `data` contributes to the hash, so callers should pass
 * only the fields that matter (e.g. price, stock, quantity). Key order is
 * irrelevant: keys are sorted at every nesting level before hashing.
 *
 * The previous implementation passed the sorted top-level keys as a
 * `JSON.stringify` array replacer. That replacer is a whitelist applied to
 * nested objects as well, so nested properties were silently dropped and
 * changes inside them went undetected. Flat records serialise exactly as
 * before, so their hashes are unchanged.
 *
 * @param data - Record whose content should be fingerprinted.
 * @returns The first 16 hex characters (64 bits) of the SHA-256 digest.
 */
export function contentHash(data: Record<string, unknown>): string {
  const normalized = canonicalJson(data) ?? "{}";
  return createHash("sha256").update(normalized).digest("hex").slice(0, 16);
}
/**
 * Parse a price string into an amount and a currency code.
 *
 * Handles: "$12.50", "12,50 €", "US$12.50", "12.50 USD", "$1,299.00",
 * "1.299,00 €", "12,50 EUR".
 *
 * Separator rules:
 * - When both "." and "," occur, the one appearing last is the decimal
 *   separator and the other is a grouping separator.
 * - A separator that occurs more than once is a grouping separator.
 * - A single comma followed by exactly three digits (and preceded by one to
 *   three digits not starting with 0) is treated as grouping ("1,299" → 1299);
 *   any other single comma is a decimal comma ("12,50" → 12.5).
 * - A single dot is always a decimal point.
 *
 * Currency is taken from the symbol (€ → EUR, £ → GBP, ¥ → CNY, $ → USD).
 * Without a symbol, a standalone ISO code (USD, EUR, GBP, CNY) is used.
 * USD is the default.
 *
 * @param raw - Price text as scraped from a page.
 * @returns The parsed amount (0 when no number can be read) and currency code.
 */
export function parsePrice(raw: string): { price: number; currency: string } {
  // Keep digits and separators only; trailing punctuation is never part of the amount.
  let digits = raw.replace(/[^\d.,]/g, "").replace(/[.,]+$/, "");
  // A leading separator is only meaningful as a bare decimal point (".50");
  // otherwise it is stray punctuation such as the dot in "approx.".
  if (!/^[.,]\d+$/.test(digits)) {
    digits = digits.replace(/^[.,]+/, "");
  }

  const lastDot = digits.lastIndexOf(".");
  const lastComma = digits.lastIndexOf(",");
  let normalized = digits;
  if (lastDot !== -1 && lastComma !== -1) {
    // Both present: the later one is the decimal separator.
    normalized =
      lastDot > lastComma
        ? digits.replace(/,/g, "")
        : digits.replace(/\./g, "").replace(",", ".");
  } else if (lastComma !== -1) {
    const isGrouping =
      digits.indexOf(",") !== lastComma || /^[1-9]\d{0,2},\d{3}$/.test(digits);
    normalized = isGrouping ? digits.replace(/,/g, "") : digits.replace(",", ".");
  } else if (lastDot !== -1 && digits.indexOf(".") !== lastDot) {
    // Several dots and no comma: dots group thousands ("1.299.000").
    normalized = digits.replace(/\./g, "");
  }

  let currency = "USD";
  if (raw.includes("€")) {
    currency = "EUR";
  } else if (raw.includes("£")) {
    currency = "GBP";
  } else if (raw.includes("¥")) {
    currency = "CNY";
  } else if (!raw.includes("$")) {
    // No symbol at all: fall back to a standalone ISO currency code.
    const code = /(?<![A-Za-z])(USD|EUR|GBP|CNY)(?![A-Za-z])/i.exec(raw);
    currency = code?.[1]?.toUpperCase() ?? "USD";
  }

  const price = Number.parseFloat(normalized);
  return { price: Number.isNaN(price) ? 0 : price, currency };
}
/**
 * Determine the stock level from various (English/German) text representations.
 *
 * Checks run from most specific to least specific so that a negated or
 * qualified phrase is not swallowed by a generic keyword it happens to
 * contain: "unavailable" contains "available", "not in stock" contains
 * "in stock", and "few left in stock" should count as low stock.
 *
 * Order: out_of_stock → discontinued → low_stock → in_stock → on_request.
 *
 * @param raw - Availability text as scraped from a page.
 * @returns The normalised stock level; "on_request" when nothing matches.
 */
export function parseStockLevel(
  raw: string
): "in_stock" | "low_stock" | "out_of_stock" | "on_request" | "discontinued" {
  const lower = raw.toLowerCase();
  const hasAny = (...phrases: string[]): boolean =>
    phrases.some((phrase) => lower.includes(phrase));

  if (
    hasAny(
      "out of stock",
      "sold out",
      "not in stock",
      "not available",
      "nicht verfügbar",
      "nicht auf lager",
      "unavailable"
    )
  )
    return "out_of_stock";
  // "eol" must stand alone so it does not match inside unrelated words.
  if (hasAny("discontinued", "end of life") || /(?<![a-z])eol(?![a-z])/.test(lower))
    return "discontinued";
  if (hasAny("low stock", "few left", "limited")) return "low_stock";
  if (hasAny("in stock", "auf lager", "available")) return "in_stock";
  return "on_request";
}
/**
 * Extract the numeric quantity from stock text.
 *
 * "23 in stock" → 23, "500+ available" → 500, "1,200 in stock" → 1200,
 * "2.500 Stück" → 2500.
 *
 * The number must directly precede (optionally with "+" and whitespace) one
 * of the unit phrases: in stock, available, auf lager, stück, units, pcs.
 * Thousands grouped with "," or "." are accepted; without that, only the
 * trailing group of a grouped number would be picked up.
 *
 * @param raw - Availability text as scraped from a page.
 * @returns The quantity, or `undefined` when no quantity phrase is found.
 */
export function parseQuantity(raw: string): number | undefined {
  const match = raw.match(
    /(\d{1,3}(?:[.,]\d{3})+|\d+)\+?\s*(?:in stock|available|auf lager|stück|units|pcs)/i
  );
  const digits = match?.[1];
  if (digits === undefined) return undefined;
  // Drop grouping separators before converting.
  return Number.parseInt(digits.replace(/[.,]/g, ""), 10);
}
/**
 * Convert lead-time text into a number of days.
 *
 * "Ships in 3-5 days" → 5, "2 weeks" → 14
 *
 * The number taken is the one immediately preceding the unit, so a range
 * yields its upper bound. A day figure (optionally "business days") is
 * preferred over a week figure when both occur; weeks count as 7 days.
 *
 * @param raw - Shipping / lead-time text.
 * @returns The lead time in days, or `undefined` when no figure is found.
 */
export function parseLeadTime(raw: string): number | undefined {
  // Tried in order: first pattern that matches wins.
  const units: ReadonlyArray<readonly [RegExp, number]> = [
    [/(\d+)\s*(business\s+)?days?/i, 1],
    [/(\d+)\s*weeks?/i, 7],
  ];

  for (const [pattern, daysPerUnit] of units) {
    const found = pattern.exec(raw);
    if (found !== null) {
      return Number.parseInt(found[1], 10) * daysPerUnit;
    }
  }
  return undefined;
}