Compare commits
No commits in common. "6ee10bf3011285171bbae5cea07807894a010901" and "7ddc439315e29ae32b9be7b5742b3675a9bf7b5e" have entirely different histories.
6ee10bf301
...
7ddc439315
@ -58,7 +58,6 @@ import {
|
|||||||
upsertPriceObservation,
|
upsertPriceObservation,
|
||||||
upsertStockObservation,
|
upsertStockObservation,
|
||||||
findOrCreateScrapedTransceiver,
|
findOrCreateScrapedTransceiver,
|
||||||
markImageVerified,
|
|
||||||
pool,
|
pool,
|
||||||
} from "../utils/db";
|
} from "../utils/db";
|
||||||
import { contentHash } from "../utils/hash";
|
import { contentHash } from "../utils/hash";
|
||||||
@ -74,7 +73,6 @@ const STOCK_FRESH_HOURS = parseInt(process.env["FS_STOCK_FRESH_HOURS"] ?? "12",
|
|||||||
const FORCE_REVALIDATE = process.env["TIP_FORCE_REVALIDATE"] === "1";
|
const FORCE_REVALIDATE = process.env["TIP_FORCE_REVALIDATE"] === "1";
|
||||||
const ONLY_MISSING_IMAGES = process.env["FS_ONLY_MISSING_IMAGES"] === "1";
|
const ONLY_MISSING_IMAGES = process.env["FS_ONLY_MISSING_IMAGES"] === "1";
|
||||||
const DB_DETAIL_ONLY = process.env["FS_DB_DETAIL_ONLY"] === "1";
|
const DB_DETAIL_ONLY = process.env["FS_DB_DETAIL_ONLY"] === "1";
|
||||||
const URL_DISCOVERY_ONLY = process.env["FS_URL_DISCOVERY_ONLY"] === "1";
|
|
||||||
|
|
||||||
const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
|
const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
|
||||||
.split(",")
|
.split(",")
|
||||||
@ -264,7 +262,6 @@ interface ProductSummary {
|
|||||||
url: string;
|
url: string;
|
||||||
name: string;
|
name: string;
|
||||||
partNumber: string;
|
partNumber: string;
|
||||||
targetTransceiverId?: string;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
interface ProductDetail extends ProductSummary {
|
interface ProductDetail extends ProductSummary {
|
||||||
@ -397,7 +394,7 @@ async function collectProductUrls(
|
|||||||
// ── Phase 2: Scrape product detail pages ──────────────────────────────────────
|
// ── Phase 2: Scrape product detail pages ──────────────────────────────────────
|
||||||
|
|
||||||
async function scrapeProductDetails(
|
async function scrapeProductDetails(
|
||||||
requests: Array<{ url: string; userData: { name: string; partNumber: string; targetTransceiverId?: string } }>,
|
requests: Array<{ url: string; userData: { name: string; partNumber: string } }>,
|
||||||
proxyConfiguration: ProxyConfiguration | undefined
|
proxyConfiguration: ProxyConfiguration | undefined
|
||||||
): Promise<ProductDetail[]> {
|
): Promise<ProductDetail[]> {
|
||||||
// Purge Phase 2 storage so it starts with a clean request queue
|
// Purge Phase 2 storage so it starts with a clean request queue
|
||||||
@ -428,9 +425,7 @@ async function scrapeProductDetails(
|
|||||||
const { name: listingName, partNumber: listingPn } = request.userData as {
|
const { name: listingName, partNumber: listingPn } = request.userData as {
|
||||||
name: string;
|
name: string;
|
||||||
partNumber: string;
|
partNumber: string;
|
||||||
targetTransceiverId?: string;
|
|
||||||
};
|
};
|
||||||
const { targetTransceiverId } = request.userData as { targetTransceiverId?: string };
|
|
||||||
const url = request.url;
|
const url = request.url;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@ -737,7 +732,6 @@ async function scrapeProductDetails(
|
|||||||
specs: raw.specs,
|
specs: raw.specs,
|
||||||
imageUrl: resolveUrl(raw.imageUrl),
|
imageUrl: resolveUrl(raw.imageUrl),
|
||||||
datasheetUrl: resolveUrl(raw.datasheetUrl),
|
datasheetUrl: resolveUrl(raw.datasheetUrl),
|
||||||
targetTransceiverId,
|
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
}, makeCrawleeConfig("fs-phase2"));
|
}, makeCrawleeConfig("fs-phase2"));
|
||||||
@ -795,34 +789,11 @@ export async function scrapeFs(): Promise<void> {
|
|||||||
|
|
||||||
// ── Phase 1: Discover product URLs ─────────────────────────────────────────
|
// ── Phase 1: Discover product URLs ─────────────────────────────────────────
|
||||||
let productMap: Map<string, ProductSummary>;
|
let productMap: Map<string, ProductSummary>;
|
||||||
if (URL_DISCOVERY_ONLY) {
|
if (DB_DETAIL_ONLY) {
|
||||||
console.log("\n[Phase 1] URL discovery mode — probing FS.COM rows without product URLs…");
|
|
||||||
const dbRows = await pool.query(
|
|
||||||
`
|
|
||||||
SELECT t.id, t.part_number
|
|
||||||
FROM transceivers t
|
|
||||||
JOIN vendors v ON v.id = t.vendor_id
|
|
||||||
WHERE v.name = 'FS.COM'
|
|
||||||
AND COALESCE(t.product_page_url, '') = ''
|
|
||||||
AND t.part_number ~ '^FS-[0-9]+$'
|
|
||||||
ORDER BY t.part_number
|
|
||||||
LIMIT $1
|
|
||||||
`,
|
|
||||||
[MAX_DETAIL_PAGES_PER_RUN]
|
|
||||||
);
|
|
||||||
productMap = new Map(
|
|
||||||
dbRows.rows.map((row) => {
|
|
||||||
const partNumber = row.part_number as string;
|
|
||||||
const productId = partNumber.replace(/^FS-/, "");
|
|
||||||
const url = `${BASE_URL}/products/${productId}.html`;
|
|
||||||
return [url, { url, name: partNumber, partNumber, targetTransceiverId: row.id as string }];
|
|
||||||
})
|
|
||||||
);
|
|
||||||
} else if (DB_DETAIL_ONLY) {
|
|
||||||
console.log("\n[Phase 1] DB detail-only mode — using existing FS.COM product URLs with missing verification signals…");
|
console.log("\n[Phase 1] DB detail-only mode — using existing FS.COM product URLs with missing verification signals…");
|
||||||
const dbRows = await pool.query(
|
const dbRows = await pool.query(
|
||||||
`
|
`
|
||||||
SELECT t.id, t.part_number, t.product_page_url
|
SELECT t.part_number, t.product_page_url
|
||||||
FROM transceivers t
|
FROM transceivers t
|
||||||
JOIN vendors v ON v.id = t.vendor_id
|
JOIN vendors v ON v.id = t.vendor_id
|
||||||
WHERE v.name = 'FS.COM'
|
WHERE v.name = 'FS.COM'
|
||||||
@ -849,7 +820,7 @@ export async function scrapeFs(): Promise<void> {
|
|||||||
dbRows.rows.map((row) => {
|
dbRows.rows.map((row) => {
|
||||||
const url = normalizeFsProductUrl(row.product_page_url as string);
|
const url = normalizeFsProductUrl(row.product_page_url as string);
|
||||||
const partNumber = row.part_number as string;
|
const partNumber = row.part_number as string;
|
||||||
return [url, { url, name: partNumber, partNumber, targetTransceiverId: row.id as string }];
|
return [url, { url, name: partNumber, partNumber }];
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
@ -918,14 +889,7 @@ export async function scrapeFs(): Promise<void> {
|
|||||||
// ── Phase 2: Scrape detail pages ────────────────────────────────────────────
|
// ── Phase 2: Scrape detail pages ────────────────────────────────────────────
|
||||||
const detailRequests = urlsToScrape.map((url) => {
|
const detailRequests = urlsToScrape.map((url) => {
|
||||||
const s = productMap.get(url);
|
const s = productMap.get(url);
|
||||||
return {
|
return { url, userData: { name: s?.name ?? "FS.com Product", partNumber: s?.partNumber ?? "" } };
|
||||||
url,
|
|
||||||
userData: {
|
|
||||||
name: s?.name ?? "FS.com Product",
|
|
||||||
partNumber: s?.partNumber ?? "",
|
|
||||||
targetTransceiverId: s?.targetTransceiverId,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
});
|
});
|
||||||
|
|
||||||
const details = await scrapeProductDetails(detailRequests, proxyConfiguration);
|
const details = await scrapeProductDetails(detailRequests, proxyConfiguration);
|
||||||
@ -947,7 +911,7 @@ export async function scrapeFs(): Promise<void> {
|
|||||||
const textForInference = `${detail.name} ${detail.partNumber} ${Object.values(detail.specs).join(" ")}`;
|
const textForInference = `${detail.name} ${detail.partNumber} ${Object.values(detail.specs).join(" ")}`;
|
||||||
const fiberType = parsed.fiberType ?? detectFiberType(textForInference);
|
const fiberType = parsed.fiberType ?? detectFiberType(textForInference);
|
||||||
|
|
||||||
const transceiverId = detail.targetTransceiverId ?? (await findOrCreateScrapedTransceiver({
|
const transceiverId = await findOrCreateScrapedTransceiver({
|
||||||
partNumber: detail.partNumber,
|
partNumber: detail.partNumber,
|
||||||
vendorId,
|
vendorId,
|
||||||
productUrl: detail.url,
|
productUrl: detail.url,
|
||||||
@ -960,37 +924,7 @@ export async function scrapeFs(): Promise<void> {
|
|||||||
wavelengths: parsed.wavelengths,
|
wavelengths: parsed.wavelengths,
|
||||||
imageUrl: detail.imageUrl,
|
imageUrl: detail.imageUrl,
|
||||||
category: "DataCenter",
|
category: "DataCenter",
|
||||||
}));
|
});
|
||||||
|
|
||||||
if (detail.targetTransceiverId) {
|
|
||||||
await pool.query(
|
|
||||||
`UPDATE transceivers
|
|
||||||
SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2),
|
|
||||||
form_factor = COALESCE(NULLIF(form_factor, ''), $3),
|
|
||||||
speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($4, speed_gbps) ELSE speed_gbps END,
|
|
||||||
speed = COALESCE(NULLIF(speed, ''), $5),
|
|
||||||
reach_label = COALESCE(NULLIF(reach_label, ''), $6),
|
|
||||||
reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($7, reach_meters) ELSE reach_meters END,
|
|
||||||
fiber_type = COALESCE(NULLIF(fiber_type, ''), $8),
|
|
||||||
wavelengths = COALESCE(NULLIF(wavelengths, ''), $9),
|
|
||||||
updated_at = NOW()
|
|
||||||
WHERE id = $1`,
|
|
||||||
[
|
|
||||||
transceiverId,
|
|
||||||
detail.url,
|
|
||||||
ff,
|
|
||||||
speedInfo?.speedGbps ?? null,
|
|
||||||
speedInfo?.speed ?? null,
|
|
||||||
reach ?? parsed.reachLabel ?? null,
|
|
||||||
parsed.reachMeters ?? null,
|
|
||||||
fiberType ?? null,
|
|
||||||
parsed.wavelengths ?? null,
|
|
||||||
]
|
|
||||||
);
|
|
||||||
if (detail.imageUrl) {
|
|
||||||
await markImageVerified(transceiverId, detail.imageUrl);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const stockLevel = deriveStockLevel(detail.deQty, detail.globalQty, detail.backorderQty);
|
const stockLevel = deriveStockLevel(detail.deQty, detail.globalQty, detail.backorderQty);
|
||||||
const totalQty = (detail.deQty ?? 0) + (detail.globalQty ?? 0);
|
const totalQty = (detail.deQty ?? 0) + (detail.globalQty ?? 0);
|
||||||
@ -1036,18 +970,14 @@ export async function scrapeFs(): Promise<void> {
|
|||||||
});
|
});
|
||||||
if (stockNew) stockWritten++;
|
if (stockNew) stockWritten++;
|
||||||
|
|
||||||
const hasSourceDetails =
|
if (Object.keys(detail.specs).length > 0) {
|
||||||
Object.keys(detail.specs).length > 0 ||
|
|
||||||
Boolean(fiberType || parsed.connector || parsed.wavelengths || parsed.reachLabel || reach);
|
|
||||||
|
|
||||||
if (hasSourceDetails) {
|
|
||||||
const updated = await updateVerifiedSpecs({
|
const updated = await updateVerifiedSpecs({
|
||||||
transceiverId,
|
transceiverId,
|
||||||
fiberType,
|
fiberType,
|
||||||
connector: parsed.connector,
|
connector: parsed.connector,
|
||||||
wavelengths: parsed.wavelengths,
|
wavelengths: parsed.wavelengths,
|
||||||
reachMeters: parsed.reachMeters,
|
reachMeters: parsed.reachMeters,
|
||||||
reachLabel: reach ?? parsed.reachLabel,
|
reachLabel: parsed.reachLabel,
|
||||||
powerConsumptionW: parsed.powerConsumptionW,
|
powerConsumptionW: parsed.powerConsumptionW,
|
||||||
tempRange: parsed.tempRange,
|
tempRange: parsed.tempRange,
|
||||||
modulation: parsed.modulation,
|
modulation: parsed.modulation,
|
||||||
|
|||||||
@ -1,86 +1,9 @@
|
|||||||
# Current TIP Sync State
|
# Current TIP Sync State
|
||||||
|
|
||||||
Updated: 2026-05-09 11:59 UTC
|
Updated: 2026-05-09 09:18 UTC
|
||||||
|
|
||||||
## Newest Work
|
## Newest Work
|
||||||
|
|
||||||
- Priority Crawlee evaluation + FS.com URL discovery on 2026-05-09:
|
|
||||||
- operator asked whether these repos help:
|
|
||||||
- `https://github.com/apify/crawlee`
|
|
||||||
- `https://github.com/apify/crawlee-python`
|
|
||||||
- `https://github.com/hiteshchoudhary/crawlee-project`
|
|
||||||
- evaluation:
|
|
||||||
- `apify/crawlee` is directly relevant and already in use in TIP via TypeScript `PlaywrightCrawler`
|
|
||||||
- current TIP benefit is not adding Crawlee, but using Crawlee more deliberately:
|
|
||||||
- bounded RequestQueues
|
|
||||||
- stable `uniqueKey`
|
|
||||||
- explicit retry/no-text classes
|
|
||||||
- isolated storage directories
|
|
||||||
- AutoscaledPool telemetry as safety signal
|
|
||||||
- hard concurrency caps on Erik
|
|
||||||
- `apify/crawlee-python` is useful for future isolated Pi/Proxmox workers, especially for Python-native extraction experiments, but should not replace the current TypeScript scraper core today
|
|
||||||
- `hiteshchoudhary/crawlee-project` is a small community/demo project, useful as inspiration only; not a production dependency for TIP
|
|
||||||
- code improved:
|
|
||||||
- `packages/scraper/src/scrapers/fs-com.ts`
|
|
||||||
- added `FS_URL_DISCOVERY_ONLY=1`
|
|
||||||
- maps existing `FS-<numeric-id>` rows without `product_page_url` to `https://www.fs.com/de/products/<id>.html`
|
|
||||||
- carries `targetTransceiverId` through the crawler so verified source evidence updates the original row instead of creating duplicates
|
|
||||||
- marks current FS.com product images verified for target rows
|
|
||||||
- accepts deterministic H1/part/spec evidence for detail verification when FS.com does not expose a traditional spec table
|
|
||||||
- live runs on Erik:
|
|
||||||
- URL discovery pilot:
|
|
||||||
- target `20`
|
|
||||||
- scraped `19`
|
|
||||||
- failed `0`
|
|
||||||
- no-url rows dropped from `76` to `57`
|
|
||||||
- full URL discovery:
|
|
||||||
- target `56`
|
|
||||||
- scraped `55`
|
|
||||||
- failed `1` (`https://www.fs.com/de/products/229461.html`, transient `ERR_NETWORK_CHANGED`)
|
|
||||||
- no-url rows dropped to `2`
|
|
||||||
- DB reconciliation with improved detail evidence:
|
|
||||||
- target `57`
|
|
||||||
- scraped `55`
|
|
||||||
- failed `0`
|
|
||||||
- new prices `41`
|
|
||||||
- stock observations `40`
|
|
||||||
- specs verified `55`
|
|
||||||
- `pnpm -C packages/scraper build` passed on Erik after the code change
|
|
||||||
- FS.com final state after URL discovery:
|
|
||||||
- total rows: `383`
|
|
||||||
- price verified: `379`
|
|
||||||
- image verified: `374`
|
|
||||||
- details verified: `373`
|
|
||||||
- price+image+details: `373`
|
|
||||||
- fully verified: `205`
|
|
||||||
- missing URL: `2`
|
|
||||||
- missing image URL: `9`
|
|
||||||
- missing reach label: `4`
|
|
||||||
- missing fiber type: `9`
|
|
||||||
- HTML product-like rows:
|
|
||||||
- total `373`
|
|
||||||
- image `372`
|
|
||||||
- details `371`
|
|
||||||
- complete `371`
|
|
||||||
- no-url rows:
|
|
||||||
- `Change`
|
|
||||||
- `FS-229461`
|
|
||||||
- category rows: `4`
|
|
||||||
- TIP health after run:
|
|
||||||
- status `healthy`
|
|
||||||
- load status `ok`
|
|
||||||
- memory used `13%`
|
|
||||||
- global verified counters:
|
|
||||||
- price `11557`
|
|
||||||
- image `10711`
|
|
||||||
- details `9929`
|
|
||||||
- fully `8526`
|
|
||||||
- training pool:
|
|
||||||
- pushed `4d9a11c crawl: add fscom url discovery learning record`
|
|
||||||
- truth:
|
|
||||||
- FS.com is still not 100% complete
|
|
||||||
- honest current claim: `371/373` HTML product-like rows complete; remaining work is small and classifiable
|
|
||||||
|
|
||||||
- TIP FS.com / Fiberstore targeted verification push on 2026-05-09:
|
- TIP FS.com / Fiberstore targeted verification push on 2026-05-09:
|
||||||
- operator requested FS.com/Fiberstore next, with all crawler/scraper/robot learnings written to the TIPLLM training pool and no external AI
|
- operator requested FS.com/Fiberstore next, with all crawler/scraper/robot learnings written to the TIPLLM training pool and no external AI
|
||||||
- code improved:
|
- code improved:
|
||||||
|
|||||||
@ -1,118 +0,0 @@
|
|||||||
# Crawlee Evaluation and FS.com URL Discovery
|
|
||||||
|
|
||||||
Date: 2026-05-09
|
|
||||||
|
|
||||||
## Question
|
|
||||||
|
|
||||||
Operator asked with highest priority whether these repositories help TIP:
|
|
||||||
|
|
||||||
- `https://github.com/apify/crawlee`
|
|
||||||
- `https://github.com/apify/crawlee-python`
|
|
||||||
- `https://github.com/hiteshchoudhary/crawlee-project`
|
|
||||||
|
|
||||||
## Evaluation
|
|
||||||
|
|
||||||
`apify/crawlee` helps directly, but TIP already uses it in the TypeScript scraper stack. The priority is to harden our current usage rather than introduce a new crawler framework.
|
|
||||||
|
|
||||||
Best immediate Crawlee practices for TIP:
|
|
||||||
|
|
||||||
- keep per-vendor bounded runs
|
|
||||||
- use stable `uniqueKey`/target IDs so retries do not create duplicate rows
|
|
||||||
- keep Crawlee storage directories isolated per vendor/run class
|
|
||||||
- record no-text and max-retry URLs as a separate retry class
|
|
||||||
- use AutoscaledPool telemetry as a safety signal
|
|
||||||
- keep Erik at low concurrency and move heavier work to Pi/Proxmox workers
|
|
||||||
|
|
||||||
`apify/crawlee-python` is useful for future isolated worker experiments on Pi/Proxmox, especially where Python extraction libraries help. It should not replace the current TypeScript crawler core today.
|
|
||||||
|
|
||||||
`hiteshchoudhary/crawlee-project` is a small community/demo app, not a production building block for TIP.
|
|
||||||
|
|
||||||
## Code
|
|
||||||
|
|
||||||
Changed:
|
|
||||||
|
|
||||||
- `packages/scraper/src/scrapers/fs-com.ts`
|
|
||||||
|
|
||||||
Added:
|
|
||||||
|
|
||||||
- `FS_URL_DISCOVERY_ONLY=1`
|
|
||||||
- target row propagation with `targetTransceiverId`
|
|
||||||
- image verification for target rows
|
|
||||||
- H1/part/spec deterministic detail verification when FS.com lacks a spec table
|
|
||||||
|
|
||||||
## Live Runs
|
|
||||||
|
|
||||||
URL discovery pilot:
|
|
||||||
|
|
||||||
- target `20`
|
|
||||||
- scraped `19`
|
|
||||||
- failed `0`
|
|
||||||
- no-url rows: `76` -> `57`
|
|
||||||
|
|
||||||
Full URL discovery:
|
|
||||||
|
|
||||||
- target `56`
|
|
||||||
- scraped `55`
|
|
||||||
- failed `1`
|
|
||||||
- failed URL: `https://www.fs.com/de/products/229461.html`
|
|
||||||
- no-url rows: `57` -> `2`
|
|
||||||
|
|
||||||
DB reconciliation:
|
|
||||||
|
|
||||||
- target `57`
|
|
||||||
- scraped `55`
|
|
||||||
- failed `0`
|
|
||||||
- new prices `41`
|
|
||||||
- stock observations `40`
|
|
||||||
- specs verified `55`
|
|
||||||
|
|
||||||
Build:
|
|
||||||
|
|
||||||
- `pnpm -C packages/scraper build` passed on Erik
|
|
||||||
|
|
||||||
## FS.com Final State
|
|
||||||
|
|
||||||
- total rows: `383`
|
|
||||||
- price verified: `379`
|
|
||||||
- image verified: `374`
|
|
||||||
- details verified: `373`
|
|
||||||
- price+image+details: `373`
|
|
||||||
- fully verified: `205`
|
|
||||||
- missing URL: `2`
|
|
||||||
- missing image URL: `9`
|
|
||||||
- missing reach label: `4`
|
|
||||||
- missing fiber type: `9`
|
|
||||||
- HTML product-like rows: `373`
|
|
||||||
- HTML product-like complete: `371`
|
|
||||||
- no-url rows: `2`
|
|
||||||
- category rows: `4`
|
|
||||||
|
|
||||||
Remaining no-url rows:
|
|
||||||
|
|
||||||
- `Change`
|
|
||||||
- `FS-229461`
|
|
||||||
|
|
||||||
TIP health after run:
|
|
||||||
|
|
||||||
- status: `healthy`
|
|
||||||
- load status: `ok`
|
|
||||||
- memory used: `13%`
|
|
||||||
- global image verified: `10711`
|
|
||||||
- global details verified: `9929`
|
|
||||||
- global fully verified: `8526`
|
|
||||||
|
|
||||||
## Training Pool
|
|
||||||
|
|
||||||
Pushed:
|
|
||||||
|
|
||||||
- `4d9a11c crawl: add fscom url discovery learning record`
|
|
||||||
|
|
||||||
## Next
|
|
||||||
|
|
||||||
Do not claim FS.com is 100% complete yet. Remaining work:
|
|
||||||
|
|
||||||
- classify `Change`
|
|
||||||
- retry or classify `FS-229461`
|
|
||||||
- classify 4 category rows
|
|
||||||
- close 9 image/fiber gaps
|
|
||||||
- then move to next high-value competitor with the same bounded Crawlee pattern
|
|
||||||
Loading…
x
Reference in New Issue
Block a user