3 changed files with 22 additions and 287 deletions
--- a/packages/scraper/src/scrapers/fs-com.ts
+++ b/packages/scraper/src/scrapers/fs-com.ts
@ -58,7 +58,6 @@ import {
  upsertPriceObservation,
  upsertStockObservation,
  findOrCreateScrapedTransceiver,
  markImageVerified,
  pool,
 } from "../utils/db";
 import { contentHash } from "../utils/hash";
@ -74,7 +73,6 @@ const STOCK_FRESH_HOURS = parseInt(process.env["FS_STOCK_FRESH_HOURS"] ?? "12",
 const FORCE_REVALIDATE = process.env["TIP_FORCE_REVALIDATE"] === "1";
 const ONLY_MISSING_IMAGES = process.env["FS_ONLY_MISSING_IMAGES"] === "1";
 const DB_DETAIL_ONLY = process.env["FS_DB_DETAIL_ONLY"] === "1";
 const URL_DISCOVERY_ONLY = process.env["FS_URL_DISCOVERY_ONLY"] === "1";
 const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
  .split(",")
@ -264,7 +262,6 @@ interface ProductSummary {
  url: string;
  name: string;
  partNumber: string;
  targetTransceiverId?: string;
 }
 interface ProductDetail extends ProductSummary {
@ -397,7 +394,7 @@ async function collectProductUrls(
 // ── Phase 2: Scrape product detail pages ──────────────────────────────────────
 async function scrapeProductDetails(
-  requests: Array<{ url: string; userData: { name: string; partNumber: string; targetTransceiverId?: string } }>,
+  requests: Array<{ url: string; userData: { name: string; partNumber: string } }>,
  proxyConfiguration: ProxyConfiguration | undefined
 ): Promise<ProductDetail[]> {
  // Purge Phase 2 storage so it starts with a clean request queue
@ -428,9 +425,7 @@ async function scrapeProductDetails(
      const { name: listingName, partNumber: listingPn } = request.userData as {
        name: string;
        partNumber: string;
        targetTransceiverId?: string;
      };
      const { targetTransceiverId } = request.userData as { targetTransceiverId?: string };
      const url = request.url;
      try {
@ -737,7 +732,6 @@ async function scrapeProductDetails(
        specs: raw.specs,
        imageUrl: resolveUrl(raw.imageUrl),
        datasheetUrl: resolveUrl(raw.datasheetUrl),
        targetTransceiverId,
      });
    },
  }, makeCrawleeConfig("fs-phase2"));
@ -795,34 +789,11 @@ export async function scrapeFs(): Promise<void> {
  // ── Phase 1: Discover product URLs ─────────────────────────────────────────
  let productMap: Map<string, ProductSummary>;
-  if (URL_DISCOVERY_ONLY) {
+  if (DB_DETAIL_ONLY) {
    console.log("\n[Phase 1] URL discovery mode — probing FS.COM rows without product URLs…");
    const dbRows = await pool.query(
      `
      SELECT t.id, t.part_number
      FROM transceivers t
      JOIN vendors v ON v.id = t.vendor_id
      WHERE v.name = 'FS.COM'
        AND COALESCE(t.product_page_url, '') = ''
        AND t.part_number ~ '^FS-[0-9]+$'
      ORDER BY t.part_number
      LIMIT $1
    `,
      [MAX_DETAIL_PAGES_PER_RUN]
    );
    productMap = new Map(
      dbRows.rows.map((row) => {
        const partNumber = row.part_number as string;
        const productId = partNumber.replace(/^FS-/, "");
        const url = `${BASE_URL}/products/${productId}.html`;
        return [url, { url, name: partNumber, partNumber, targetTransceiverId: row.id as string }];
      })
    );
  } else if (DB_DETAIL_ONLY) {
    console.log("\n[Phase 1] DB detail-only mode — using existing FS.COM product URLs with missing verification signals…");
    const dbRows = await pool.query(
      `
-      SELECT t.id, t.part_number, t.product_page_url
+      SELECT t.part_number, t.product_page_url
      FROM transceivers t
      JOIN vendors v ON v.id = t.vendor_id
      WHERE v.name = 'FS.COM'
@ -849,7 +820,7 @@ export async function scrapeFs(): Promise<void> {
      dbRows.rows.map((row) => {
        const url = normalizeFsProductUrl(row.product_page_url as string);
        const partNumber = row.part_number as string;
-        return [url, { url, name: partNumber, partNumber, targetTransceiverId: row.id as string }];
+        return [url, { url, name: partNumber, partNumber }];
      })
    );
  } else {
@ -918,14 +889,7 @@ export async function scrapeFs(): Promise<void> {
  // ── Phase 2: Scrape detail pages ────────────────────────────────────────────
  const detailRequests = urlsToScrape.map((url) => {
    const s = productMap.get(url);
-    return {
+    return { url, userData: { name: s?.name ?? "FS.com Product", partNumber: s?.partNumber ?? "" } };
      url,
      userData: {
        name: s?.name ?? "FS.com Product",
        partNumber: s?.partNumber ?? "",
        targetTransceiverId: s?.targetTransceiverId,
      },
    };
  });
  const details = await scrapeProductDetails(detailRequests, proxyConfiguration);
@ -947,7 +911,7 @@ export async function scrapeFs(): Promise<void> {
      const textForInference = `${detail.name} ${detail.partNumber} ${Object.values(detail.specs).join(" ")}`;
      const fiberType = parsed.fiberType ?? detectFiberType(textForInference);
-      const transceiverId = detail.targetTransceiverId ?? (await findOrCreateScrapedTransceiver({
+      const transceiverId = await findOrCreateScrapedTransceiver({
        partNumber: detail.partNumber,
        vendorId,
        productUrl: detail.url,
@ -960,37 +924,7 @@ export async function scrapeFs(): Promise<void> {
        wavelengths: parsed.wavelengths,
        imageUrl: detail.imageUrl,
        category: "DataCenter",
-        }));
+      });
      if (detail.targetTransceiverId) {
        await pool.query(
          `UPDATE transceivers
           SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2),
               form_factor = COALESCE(NULLIF(form_factor, ''), $3),
               speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($4, speed_gbps) ELSE speed_gbps END,
               speed = COALESCE(NULLIF(speed, ''), $5),
               reach_label = COALESCE(NULLIF(reach_label, ''), $6),
               reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($7, reach_meters) ELSE reach_meters END,
               fiber_type = COALESCE(NULLIF(fiber_type, ''), $8),
               wavelengths = COALESCE(NULLIF(wavelengths, ''), $9),
               updated_at = NOW()
           WHERE id = $1`,
          [
            transceiverId,
            detail.url,
            ff,
            speedInfo?.speedGbps ?? null,
            speedInfo?.speed ?? null,
            reach ?? parsed.reachLabel ?? null,
            parsed.reachMeters ?? null,
            fiberType ?? null,
            parsed.wavelengths ?? null,
          ]
        );
        if (detail.imageUrl) {
          await markImageVerified(transceiverId, detail.imageUrl);
        }
      }
      const stockLevel = deriveStockLevel(detail.deQty, detail.globalQty, detail.backorderQty);
      const totalQty = (detail.deQty ?? 0) + (detail.globalQty ?? 0);
@ -1036,18 +970,14 @@ export async function scrapeFs(): Promise<void> {
      });
      if (stockNew) stockWritten++;
-      const hasSourceDetails =
+      if (Object.keys(detail.specs).length > 0) {
        Object.keys(detail.specs).length > 0 ||
        Boolean(fiberType || parsed.connector || parsed.wavelengths || parsed.reachLabel || reach);
      if (hasSourceDetails) {
        const updated = await updateVerifiedSpecs({
          transceiverId,
          fiberType,
          connector: parsed.connector,
          wavelengths: parsed.wavelengths,
          reachMeters: parsed.reachMeters,
-          reachLabel: reach ?? parsed.reachLabel,
+          reachLabel: parsed.reachLabel,
          powerConsumptionW: parsed.powerConsumptionW,
          tempRange: parsed.tempRange,
          modulation: parsed.modulation,
--- a/sync/CURRENT.md
+++ b/sync/CURRENT.md
@ -1,86 +1,9 @@
 # Current TIP Sync State
-Updated: 2026-05-09 11:59 UTC
+Updated: 2026-05-09 09:18 UTC
 ## Newest Work
 - Priority Crawlee evaluation + FS.com URL discovery on 2026-05-09:
  - operator asked whether these repos help:
    - `https://github.com/apify/crawlee`
    - `https://github.com/apify/crawlee-python`
    - `https://github.com/hiteshchoudhary/crawlee-project`
  - evaluation:
    - `apify/crawlee` is directly relevant and already in use in TIP via TypeScript `PlaywrightCrawler`
    - the gain for TIP is not in adopting Crawlee (it is already in use) but in using it more deliberately:
      - bounded RequestQueues
      - stable `uniqueKey`
      - explicit retry/no-text classes
      - isolated storage directories
      - AutoscaledPool telemetry as safety signal
      - hard concurrency caps on Erik
    - `apify/crawlee-python` is useful for future isolated Pi/Proxmox workers, especially for Python-native extraction experiments, but should not replace the current TypeScript scraper core today
    - `hiteshchoudhary/crawlee-project` is a small community/demo project, useful as inspiration only; not a production dependency for TIP
  - code improved:
    - `packages/scraper/src/scrapers/fs-com.ts`
      - added `FS_URL_DISCOVERY_ONLY=1`
      - maps existing `FS-<numeric-id>` rows without `product_page_url` to `https://www.fs.com/de/products/<id>.html`
      - carries `targetTransceiverId` through the crawler so verified source evidence updates the original row instead of creating duplicates
      - marks current FS.com product images verified for target rows
      - accepts deterministic H1/part/spec evidence for detail verification when FS.com does not expose a traditional spec table
  - live runs on Erik:
    - URL discovery pilot:
      - target `20`
      - scraped `19`
      - failed `0`
      - no-url rows dropped from `76` to `57`
    - full URL discovery:
      - target `56`
      - scraped `55`
      - failed `1` (`https://www.fs.com/de/products/229461.html`, transient `ERR_NETWORK_CHANGED`)
      - no-url rows dropped to `2`
    - DB reconciliation with improved detail evidence:
      - target `57`
      - scraped `55`
      - failed `0`
      - new prices `41`
      - stock observations `40`
      - specs verified `55`
    - `pnpm -C packages/scraper build` passed on Erik after the code change
  - FS.com final state after URL discovery:
    - total rows: `383`
    - price verified: `379`
    - image verified: `374`
    - details verified: `373`
    - price+image+details: `373`
    - fully verified: `205`
    - missing URL: `2`
    - missing image URL: `9`
    - missing reach label: `4`
    - missing fiber type: `9`
    - HTML product-like rows:
      - total `373`
      - image `372`
      - details `371`
      - complete `371`
    - no-url rows:
      - `Change`
      - `FS-229461`
    - category rows: `4`
  - TIP health after run:
    - status `healthy`
    - load status `ok`
    - memory used `13%`
    - global verified counters:
      - price `11557`
      - image `10711`
      - details `9929`
      - fully `8526`
  - training pool:
    - pushed `4d9a11c crawl: add fscom url discovery learning record`
  - truth:
    - FS.com is still not 100% complete
    - honest current claim: `371/373` HTML product-like rows complete; remaining work is small and classifiable
 - TIP FS.com / Fiberstore targeted verification push on 2026-05-09:
  - operator requested FS.com/Fiberstore next, with all crawler/scraper/robot learnings written to the TIPLLM training pool and no external AI
  - code improved:
--- a/sync/history/2026-05-09-crawlee-evaluation-and-fscom-url-discovery.md
+++ b/sync/history/2026-05-09-crawlee-evaluation-and-fscom-url-discovery.md
@ -1,118 +0,0 @@
 # Crawlee Evaluation and FS.com URL Discovery
 Date: 2026-05-09
 ## Question
 The operator asked, as the highest priority, whether these repositories help TIP:
 - `https://github.com/apify/crawlee`
 - `https://github.com/apify/crawlee-python`
 - `https://github.com/hiteshchoudhary/crawlee-project`
 ## Evaluation
 `apify/crawlee` helps directly, but TIP already uses it in the TypeScript scraper stack. The priority is to harden our current usage rather than introduce a new crawler framework.
 Best immediate Crawlee practices for TIP:
 - keep per-vendor bounded runs
 - use stable `uniqueKey`/target IDs so retries do not create duplicate rows
 - keep Crawlee storage directories isolated per vendor/run class
 - record no-text and max-retry URLs as a separate retry class
 - use AutoscaledPool telemetry as a safety signal
 - keep Erik at low concurrency and move heavier work to Pi/Proxmox workers
 `apify/crawlee-python` is useful for future isolated worker experiments on Pi/Proxmox, especially where Python extraction libraries help. It should not replace the current TypeScript crawler core today.
 `hiteshchoudhary/crawlee-project` is a small community/demo app, not a production building block for TIP.
 ## Code
 Changed:
 - `packages/scraper/src/scrapers/fs-com.ts`
 Added:
 - `FS_URL_DISCOVERY_ONLY=1`
 - target row propagation with `targetTransceiverId`
 - image verification for target rows
 - H1/part/spec deterministic detail verification when FS.com lacks a spec table
 ## Live Runs
 URL discovery pilot:
 - target `20`
 - scraped `19`
 - failed `0`
 - no-url rows: `76` -> `57`
 Full URL discovery:
 - target `56`
 - scraped `55`
 - failed `1`
 - failed URL: `https://www.fs.com/de/products/229461.html`
 - no-url rows: `57` -> `2`
 DB reconciliation:
 - target `57`
 - scraped `55`
 - failed `0`
 - new prices `41`
 - stock observations `40`
 - specs verified `55`
 Build:
 - `pnpm -C packages/scraper build` passed on Erik
 ## FS.com Final State
 - total rows: `383`
 - price verified: `379`
 - image verified: `374`
 - details verified: `373`
 - price+image+details: `373`
 - fully verified: `205`
 - missing URL: `2`
 - missing image URL: `9`
 - missing reach label: `4`
 - missing fiber type: `9`
 - HTML product-like rows: `373`
 - HTML product-like complete: `371`
 - no-url rows: `2`
 - category rows: `4`
 Remaining no-url rows:
 - `Change`
 - `FS-229461`
 TIP health after run:
 - status: `healthy`
 - load status: `ok`
 - memory used: `13%`
 - global image verified: `10711`
 - global details verified: `9929`
 - global fully verified: `8526`
 ## Training Pool
 Pushed:
 - `4d9a11c crawl: add fscom url discovery learning record`
 ## Next
 Do not claim FS.com is 100% complete yet. Remaining work:
 - classify `Change`
 - retry or classify `FS-229461`
 - classify 4 category rows
 - close 9 image/fiber gaps
 - then move to next high-value competitor with the same bounded Crawlee pattern