Compare commits
No commits in common. "7ddc439315e29ae32b9be7b5742b3675a9bf7b5e" and "e73defe6bbbc08515caeb03699574d3902cc82e8" have entirely different histories.
7ddc439315
...
e73defe6bb
@ -72,7 +72,6 @@ const MAX_DETAIL_PAGES_PER_RUN = parseInt(process.env["FS_MAX_DETAIL_PAGES_PER_R
|
|||||||
const STOCK_FRESH_HOURS = parseInt(process.env["FS_STOCK_FRESH_HOURS"] ?? "12", 10);
|
const STOCK_FRESH_HOURS = parseInt(process.env["FS_STOCK_FRESH_HOURS"] ?? "12", 10);
|
||||||
const FORCE_REVALIDATE = process.env["TIP_FORCE_REVALIDATE"] === "1";
|
const FORCE_REVALIDATE = process.env["TIP_FORCE_REVALIDATE"] === "1";
|
||||||
const ONLY_MISSING_IMAGES = process.env["FS_ONLY_MISSING_IMAGES"] === "1";
|
const ONLY_MISSING_IMAGES = process.env["FS_ONLY_MISSING_IMAGES"] === "1";
|
||||||
const DB_DETAIL_ONLY = process.env["FS_DB_DETAIL_ONLY"] === "1";
|
|
||||||
|
|
||||||
const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
|
const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
|
||||||
.split(",")
|
.split(",")
|
||||||
@ -242,18 +241,8 @@ function detectSpeed(text: string): { speed: string; speedGbps: number } | undef
|
|||||||
}
|
}
|
||||||
|
|
||||||
function detectReach(text: string): string | undefined {
|
function detectReach(text: string): string | undefined {
|
||||||
const m = text.match(/(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(m|km)\b/i);
|
const m = text.match(/(\d+)\s*(m|km)\b/i);
|
||||||
return m ? `${m[1].replace(/,/g, "")}${m[2].toLowerCase()}` : undefined;
|
return m ? `${m[1]}${m[2].toLowerCase()}` : undefined;
|
||||||
}
|
|
||||||
|
|
||||||
function detectFiberType(text: string): string | undefined {
|
|
||||||
if (/active\s+optical|\baoc\b/i.test(text)) return "AOC";
|
|
||||||
if (/copper|dac|twinax|direct\s+attach|rj-?45|base-t/i.test(text)) return "Copper";
|
|
||||||
if (/single.?mode|\bsmf\b|os2|cwdm|dwdm|\bcw-|^cw-|dw-|bidi|\blx\b|\blr\d*\b|\ber\d*\b|\bzr\d*\b|\bdr\d*\b|\bfr\d*\b|\bpsm\d*\b/i.test(text)) {
|
|
||||||
return "SMF";
|
|
||||||
}
|
|
||||||
if (/multi.?mode|\bmmf\b|om[1-5]|\bsx\b|\bsr\d*\b/i.test(text)) return "MMF";
|
|
||||||
return undefined;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Types ──────────────────────────────────────────────────────────────────────
|
// ── Types ──────────────────────────────────────────────────────────────────────
|
||||||
@ -788,45 +777,8 @@ export async function scrapeFs(): Promise<void> {
|
|||||||
console.log(`Vendor ID: ${vendorId}`);
|
console.log(`Vendor ID: ${vendorId}`);
|
||||||
|
|
||||||
// ── Phase 1: Discover product URLs ─────────────────────────────────────────
|
// ── Phase 1: Discover product URLs ─────────────────────────────────────────
|
||||||
let productMap: Map<string, ProductSummary>;
|
|
||||||
if (DB_DETAIL_ONLY) {
|
|
||||||
console.log("\n[Phase 1] DB detail-only mode — using existing FS.COM product URLs with missing verification signals…");
|
|
||||||
const dbRows = await pool.query(
|
|
||||||
`
|
|
||||||
SELECT t.part_number, t.product_page_url
|
|
||||||
FROM transceivers t
|
|
||||||
JOIN vendors v ON v.id = t.vendor_id
|
|
||||||
WHERE v.name = 'FS.COM'
|
|
||||||
AND t.product_page_url IS NOT NULL
|
|
||||||
AND t.product_page_url != ''
|
|
||||||
AND t.product_page_url LIKE '%/products/%'
|
|
||||||
AND (
|
|
||||||
COALESCE(t.price_verified, false) = false
|
|
||||||
OR COALESCE(t.image_verified, false) = false
|
|
||||||
OR COALESCE(t.details_verified, false) = false
|
|
||||||
OR COALESCE(t.fiber_type, '') = ''
|
|
||||||
OR COALESCE(t.reach_label, '') = ''
|
|
||||||
)
|
|
||||||
ORDER BY
|
|
||||||
COALESCE(t.price_verified, false) DESC,
|
|
||||||
COALESCE(t.image_verified, false) DESC,
|
|
||||||
COALESCE(t.details_verified, false) ASC,
|
|
||||||
t.part_number
|
|
||||||
LIMIT $1
|
|
||||||
`,
|
|
||||||
[MAX_DETAIL_PAGES_PER_RUN]
|
|
||||||
);
|
|
||||||
productMap = new Map(
|
|
||||||
dbRows.rows.map((row) => {
|
|
||||||
const url = normalizeFsProductUrl(row.product_page_url as string);
|
|
||||||
const partNumber = row.part_number as string;
|
|
||||||
return [url, { url, name: partNumber, partNumber }];
|
|
||||||
})
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
console.log("\n[Phase 1] Collecting product URLs from category listing pages…");
|
console.log("\n[Phase 1] Collecting product URLs from category listing pages…");
|
||||||
productMap = await collectProductUrls(proxyConfiguration);
|
const productMap = await collectProductUrls(proxyConfiguration);
|
||||||
}
|
|
||||||
|
|
||||||
if (productMap.size === 0) {
|
if (productMap.size === 0) {
|
||||||
console.warn("[Phase 1] No products discovered — check selectors or proxy.");
|
console.warn("[Phase 1] No products discovered — check selectors or proxy.");
|
||||||
@ -908,19 +860,16 @@ export async function scrapeFs(): Promise<void> {
|
|||||||
const speedInfo = detectSpeed(detail.name);
|
const speedInfo = detectSpeed(detail.name);
|
||||||
const reach = detectReach(detail.name);
|
const reach = detectReach(detail.name);
|
||||||
const parsed = parseSpecTable(detail.specs);
|
const parsed = parseSpecTable(detail.specs);
|
||||||
const textForInference = `${detail.name} ${detail.partNumber} ${Object.values(detail.specs).join(" ")}`;
|
|
||||||
const fiberType = parsed.fiberType ?? detectFiberType(textForInference);
|
|
||||||
|
|
||||||
const transceiverId = await findOrCreateScrapedTransceiver({
|
const transceiverId = await findOrCreateScrapedTransceiver({
|
||||||
partNumber: detail.partNumber,
|
partNumber: detail.partNumber,
|
||||||
vendorId,
|
vendorId,
|
||||||
productUrl: detail.url,
|
|
||||||
formFactor: ff,
|
formFactor: ff,
|
||||||
speedGbps: speedInfo?.speedGbps,
|
speedGbps: speedInfo?.speedGbps,
|
||||||
speed: speedInfo?.speed,
|
speed: speedInfo?.speed,
|
||||||
reachLabel: reach ?? parsed.reachLabel,
|
reachLabel: reach ?? parsed.reachLabel,
|
||||||
reachMeters: parsed.reachMeters,
|
reachMeters: parsed.reachMeters,
|
||||||
fiberType,
|
fiberType: parsed.fiberType,
|
||||||
wavelengths: parsed.wavelengths,
|
wavelengths: parsed.wavelengths,
|
||||||
imageUrl: detail.imageUrl,
|
imageUrl: detail.imageUrl,
|
||||||
category: "DataCenter",
|
category: "DataCenter",
|
||||||
@ -973,7 +922,7 @@ export async function scrapeFs(): Promise<void> {
|
|||||||
if (Object.keys(detail.specs).length > 0) {
|
if (Object.keys(detail.specs).length > 0) {
|
||||||
const updated = await updateVerifiedSpecs({
|
const updated = await updateVerifiedSpecs({
|
||||||
transceiverId,
|
transceiverId,
|
||||||
fiberType,
|
fiberType: parsed.fiberType,
|
||||||
connector: parsed.connector,
|
connector: parsed.connector,
|
||||||
wavelengths: parsed.wavelengths,
|
wavelengths: parsed.wavelengths,
|
||||||
reachMeters: parsed.reachMeters,
|
reachMeters: parsed.reachMeters,
|
||||||
@ -984,7 +933,7 @@ export async function scrapeFs(): Promise<void> {
|
|||||||
domSupport: parsed.domSupport,
|
domSupport: parsed.domSupport,
|
||||||
imageUrl: detail.imageUrl,
|
imageUrl: detail.imageUrl,
|
||||||
datasheetUrl: detail.datasheetUrl,
|
datasheetUrl: detail.datasheetUrl,
|
||||||
source: detail.url,
|
source: "fs.com",
|
||||||
});
|
});
|
||||||
if (updated) specsUpdated++;
|
if (updated) specsUpdated++;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,96 +1,9 @@
|
|||||||
# Current TIP Sync State
|
# Current TIP Sync State
|
||||||
|
|
||||||
Updated: 2026-05-09 09:18 UTC
|
Updated: 2026-05-09 07:34 UTC
|
||||||
|
|
||||||
## Newest Work
|
## Newest Work
|
||||||
|
|
||||||
- TIP FS.com / Fiberstore targeted verification push on 2026-05-09:
|
|
||||||
- operator requested FS.com/Fiberstore next, with all crawler/scraper/robot learnings written to the TIPLLM training pool and no external AI
|
|
||||||
- code improved:
|
|
||||||
- `packages/scraper/src/scrapers/fs-com.ts`
|
|
||||||
- added `FS_DB_DETAIL_ONLY=1` mode to revalidate existing FS.COM product URLs directly from DB
|
|
||||||
- avoids broad category/listing discovery while product URLs still need verification
|
|
||||||
- `detectReach()` now handles comma thousands and decimal values
|
|
||||||
- added deterministic `detectFiberType()` fallback from product name, part number and specs
|
|
||||||
- scraper now writes `productUrl` into the transceiver row
|
|
||||||
- detail verification source is now the actual FS.com product URL instead of the literal `fs.com`
|
|
||||||
- live Erik verification:
|
|
||||||
- deployed scraper to `/opt/tip`
|
|
||||||
- `pnpm -C packages/scraper build` passed on Erik after the change
|
|
||||||
- ran four safe DB-detail-only Playwright batches:
|
|
||||||
- batch 1: target `80`, scraped `80`, failed `0`, new prices `17`, stock `18`, specs `24`
|
|
||||||
- batch 2: target `80`, scraped `79`, failed `0`, new prices `6`, stock `8`, specs `23`
|
|
||||||
- batch 3: target `90`, scraped `89`, failed `0`, new prices `21`, stock `24`, specs `47`
|
|
||||||
- batch 4 closure: target `42`, scraped `42`, failed `0`, new prices `5`, stock `3`, specs `25`
|
|
||||||
- all runs used Playwright concurrency `1`, `nice -n 10`, and no broad category crawl
|
|
||||||
- Erik/TIP health after closure:
|
|
||||||
- status: `healthy`
|
|
||||||
- load status: `ok`
|
|
||||||
- memory used: `13%`
|
|
||||||
- transceivers: `17647`
|
|
||||||
- vendors: `478`
|
|
||||||
- switches: `680`
|
|
||||||
- global verified counters:
|
|
||||||
- price: `11557`
|
|
||||||
- image: `10636`
|
|
||||||
- details: `9816`
|
|
||||||
- fully: `8522`
|
|
||||||
- FS.com before targeted detail batches:
|
|
||||||
- total rows: `383`
|
|
||||||
- price verified: `379`
|
|
||||||
- image verified: `299`
|
|
||||||
- details verified: `108`
|
|
||||||
- price+image+details: `108`
|
|
||||||
- fully verified: `3`
|
|
||||||
- missing product URL: `76`
|
|
||||||
- missing image URL: `84`
|
|
||||||
- missing reach label: `9`
|
|
||||||
- missing fiber type: `323`
|
|
||||||
- HTML product-like complete rows: `106`
|
|
||||||
- FS.com after closure:
|
|
||||||
- total rows: `383`
|
|
||||||
- price verified: `379`
|
|
||||||
- image verified: `299`
|
|
||||||
- details verified: `260`
|
|
||||||
- price+image+details: `260`
|
|
||||||
- fully verified: `205`
|
|
||||||
- missing product URL: `76`
|
|
||||||
- missing image URL: `84`
|
|
||||||
- missing reach label: `9`
|
|
||||||
- missing fiber type: `123`
|
|
||||||
- HTML product-like rows:
|
|
||||||
- total `299`
|
|
||||||
- price `299`
|
|
||||||
- image `282`
|
|
||||||
- details `258`
|
|
||||||
- complete `258`
|
|
||||||
- no-url rows:
|
|
||||||
- total `76`
|
|
||||||
- price `76`
|
|
||||||
- image `15`
|
|
||||||
- details `0`
|
|
||||||
- category rows:
|
|
||||||
- total `4`
|
|
||||||
- no verified signals
|
|
||||||
- interpretation / next strategy:
|
|
||||||
- the DB-detail-only approach is now mostly exhausted
|
|
||||||
- the fourth clean closure batch did not raise `details_verified`; it only nudged `fully_verified` from `199` to `205`
|
|
||||||
- do not keep repeating the same FS.com detail crawler on Erik
|
|
||||||
- next FS.com work should be:
|
|
||||||
- source-discovery/classification robot for the `76` no-url rows
|
|
||||||
- parser/source diagnostics for the remaining `41` HTML product-like rows missing detail/fiber/image signals
|
|
||||||
- likely separate handling for malformed or historical `/de/de/products/...` URLs and pages that return no useful text
|
|
||||||
- TIPLLM training pool:
|
|
||||||
- all four FS.com batches were written and pushed to Gitea
|
|
||||||
- latest training commits:
|
|
||||||
- `28cac05` batch 1
|
|
||||||
- `a0a6be3` batch 2
|
|
||||||
- `38736ae` batch 3
|
|
||||||
- `2c25bf3` closure batch
|
|
||||||
- important truth:
|
|
||||||
- do not claim FS.com is complete
|
|
||||||
- the honest current claim is: FS.com product-like coverage improved strongly, but only `258/299` HTML product-like rows are complete, and `76` no-url rows still need source discovery/classification
|
|
||||||
|
|
||||||
- TIP Flexoptix completion push on 2026-05-09:
|
- TIP Flexoptix completion push on 2026-05-09:
|
||||||
- operator said "feuer frei" after confirming Flexoptix was not yet complete
|
- operator said "feuer frei" after confirming Flexoptix was not yet complete
|
||||||
- TIPLLM training pool was updated immediately with the truth rule:
|
- TIPLLM training pool was updated immediately with the truth rule:
|
||||||
|
|||||||
@ -1,101 +0,0 @@
|
|||||||
# FS.com / Fiberstore Targeted Verification Push
|
|
||||||
|
|
||||||
Date: 2026-05-09
|
|
||||||
|
|
||||||
## Intent
|
|
||||||
|
|
||||||
Continue TIP data completion for FS.com/Fiberstore after Flexoptix. The operator requested that price, image and product information be researched deeply enough to avoid manual validation, while keeping Erik safe and writing every crawler/scraper/robot learning into the TIPLLM training pool.
|
|
||||||
|
|
||||||
## Code Changed
|
|
||||||
|
|
||||||
- `packages/scraper/src/scrapers/fs-com.ts`
|
|
||||||
- added `FS_DB_DETAIL_ONLY=1`
|
|
||||||
- targets existing FS.COM DB product URLs with missing verification signals
|
|
||||||
- avoids broad category discovery while known product URLs still need work
|
|
||||||
- improved reach parsing for comma/decimal values
|
|
||||||
- added deterministic fiber type fallback from product name, part number and specs
|
|
||||||
- writes product URL to `transceivers.product_page_url`
|
|
||||||
- stores the real FS.com product URL as detail verification source
|
|
||||||
|
|
||||||
## Live Runs
|
|
||||||
|
|
||||||
All runs were on Erik with:
|
|
||||||
|
|
||||||
- Playwright concurrency `1`
|
|
||||||
- `nice -n 10`
|
|
||||||
- no broad category crawl
|
|
||||||
- DB-detail-only mode
|
|
||||||
|
|
||||||
Batch results:
|
|
||||||
|
|
||||||
- Batch 1: target `80`, scraped `80`, failed `0`, new prices `17`, stock `18`, specs `24`
|
|
||||||
- Batch 2: target `80`, scraped `79`, failed `0`, new prices `6`, stock `8`, specs `23`
|
|
||||||
- Batch 3: target `90`, scraped `89`, failed `0`, new prices `21`, stock `24`, specs `47`
|
|
||||||
- Batch 4 closure: target `42`, scraped `42`, failed `0`, new prices `5`, stock `3`, specs `25`
|
|
||||||
|
|
||||||
`pnpm -C packages/scraper build` passed on Erik after the scraper change.
|
|
||||||
|
|
||||||
## FS.com Counters
|
|
||||||
|
|
||||||
Before:
|
|
||||||
|
|
||||||
- total rows: `383`
|
|
||||||
- price verified: `379`
|
|
||||||
- image verified: `299`
|
|
||||||
- details verified: `108`
|
|
||||||
- price+image+details: `108`
|
|
||||||
- fully verified: `3`
|
|
||||||
- missing URL: `76`
|
|
||||||
- missing image URL: `84`
|
|
||||||
- missing reach label: `9`
|
|
||||||
- missing fiber type: `323`
|
|
||||||
- HTML product-like complete: `106`
|
|
||||||
|
|
||||||
After closure:
|
|
||||||
|
|
||||||
- total rows: `383`
|
|
||||||
- price verified: `379`
|
|
||||||
- image verified: `299`
|
|
||||||
- details verified: `260`
|
|
||||||
- price+image+details: `260`
|
|
||||||
- fully verified: `205`
|
|
||||||
- missing URL: `76`
|
|
||||||
- missing image URL: `84`
|
|
||||||
- missing reach label: `9`
|
|
||||||
- missing fiber type: `123`
|
|
||||||
- HTML product-like rows: `299`
|
|
||||||
- HTML product-like complete: `258`
|
|
||||||
- no-url rows: `76`
|
|
||||||
- category rows: `4`
|
|
||||||
|
|
||||||
TIP health after closure:
|
|
||||||
|
|
||||||
- status: `healthy`
|
|
||||||
- load status: `ok`
|
|
||||||
- memory used: `13%`
|
|
||||||
- transceivers: `17647`
|
|
||||||
- vendors: `478`
|
|
||||||
- switches: `680`
|
|
||||||
- fully verified globally: `8522`
|
|
||||||
|
|
||||||
## Training Pool
|
|
||||||
|
|
||||||
FS.com batches were written to `/tmp/tip-training-data` and pushed to Gitea.
|
|
||||||
|
|
||||||
Training pool commits:
|
|
||||||
|
|
||||||
- `28cac05 crawl: add fscom db detail batch learning record`
|
|
||||||
- `a0a6be3 crawl: add fscom db detail batch 2 learning record`
|
|
||||||
- `38736ae crawl: add fscom db detail batch 3 learning record`
|
|
||||||
- `2c25bf3 crawl: add fscom db detail closure learning record`
|
|
||||||
|
|
||||||
## Next
|
|
||||||
|
|
||||||
Do not repeat the same DB-detail-only FS.com crawler on Erik. The fourth clean closure batch did not increase `details_verified`, so the remaining gaps need a different strategy:
|
|
||||||
|
|
||||||
- source-discovery/classification for `76` no-url rows
|
|
||||||
- parser/source diagnostics for the remaining `41` HTML product-like rows missing details or fiber/image signals
|
|
||||||
- explicit classification for `4` category rows
|
|
||||||
- likely cleanup of historical/malformed `/de/de/products/...` URLs and no-text pages
|
|
||||||
|
|
||||||
Truth rule: do not claim FS.com is complete. Current honest status is `258/299` HTML product-like rows complete and `205/383` fully verified overall.
|
|
||||||
Loading…
x
Reference in New Issue
Block a user