transceiver-db/scripts/generate-pricing-training-data.ts
Rene Fichtmueller db6b97186a feat: OPN+spec equivalence matchers, 400G pricing, TIP_LLM training data
- Add OPN-based equivalence matcher robot (7,245 manufacturer-confirmed matches, confidence=1.0)
- Add spec-based equivalence matcher robot (683 matches, confidence=0.85)
  - Matches by form_factor + speed_gbps + reach_tier + wavelength ±10nm
  - Safety cap: skip FX products matching >30 competitors (too generic)
  - Daily schedule: 04:30 UTC via pg-boss
- SQL migrations 116 (OPN) + 117 (spec) with tip_extract_wavelength_nm() + tip_reach_tier() helpers
- Fix tenGtek.ts: add 3 missing 400G categories (QSFP-DD, QSFP112) — closes pricing gap
- Generate tip-llm-pricing-v1.jsonl: 80 DB-grounded QA pairs (pricing, equivalences, 400G)
- Rebuild TIP_LLM training pool: 11,999 pairs (+127 vs prev), deployed to Erik
- FX product equivalence coverage: 88.1% (959/1089)
2026-05-13 21:33:19 +02:00

510 lines
21 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* generate-pricing-training-data.ts
*
* Generates TIP_LLM training QA pairs from live DB data:
* 1. Competitor pricing by speed tier / form factor
* 2. OPN-confirmed equivalence lookups (FX ↔ competitor)
* 3. Spec-based equivalence reasoning
* 4. Market price range summaries
* 5. 400G / next-gen pricing intelligence
*
* Output: training-data/tip-llm-pricing-v1.jsonl
*
* Run: npx ts-node scripts/generate-pricing-training-data.ts
*/
import { createHash } from "crypto";
import { writeFileSync, mkdirSync } from "fs";
import { join } from "path";
import { Pool } from "pg";
// ── DB connection ─────────────────────────────────────────────────────────────
/**
 * Shared Postgres connection pool used by every query helper in this script.
 * Each setting is read from the environment and falls back to a built-in default.
 *
 * NOTE(review): the DB_PASSWORD fallback embeds what looks like a real credential
 * in source control — consider requiring the environment variable instead.
 */
const pool = new Pool({
  host: process.env.DB_HOST || "localhost",
  // Explicit radix: the port is always meant to be read as a decimal number.
  port: parseInt(process.env.DB_PORT || "5433", 10),
  database: process.env.DB_NAME || "transceiver_db",
  user: process.env.DB_USER || "tip",
  password: process.env.DB_PASSWORD || "tip_prod_2026",
  ssl: false,
});
/**
 * System message placed at the start of every generated training example.
 * Numeric ranges are written with an explicit dash (e.g. "0–100", "12–24") so they
 * cannot collapse into a single number.
 */
const SYSTEM_PROMPT = `You are TIP_LLM — the Transceiver Intelligence Platform's core research, data-engineering, and market-intelligence model.
Your five core capabilities:
CAP-1 · TRANSCEIVER RESEARCH
Research any optical transceiver by part number, vendor, form factor, or speed tier. Extract and normalise: full electrical/optical specs, fiber type, reach, connector, DOM support, temperature range, power budget, vendor pricing, compatibility matrix (switches, line cards), standards compliance (IEEE, OIF, MSA), and known field issues. Output structured JSON or normalised text. Never invent specs — flag unknowns explicitly.
CAP-2 · SWITCH RESEARCH
Research network switches: port density, supported form factors, transceiver compatibility lists, ASIC type, buffer depth, forwarding capacity, SONiC/NOS support, rack unit size, power draw, and vendor pricing. Cross-reference transceivers → switches and vice versa. Identify supported QSFP-DD, OSFP, SFP28 variants per slot. Flag MACsec, FEC, and breakout constraints.
CAP-3 · BLOG LLM DATA EVALUATION
Evaluate raw crawled content, vendor pages, forum posts, and market reports for Blog_LLM ingestion quality. Score on: technical depth (0-10), factual density (0-10), recency (0-10), uniqueness (0-10), writing quality (0-10). Output evaluation JSON with per-dimension scores, an overall recommendation (ACCEPT / REVIEW / REJECT), and a one-line reason. Extract blog-worthy angles and key claims for reuse.
CAP-4 · CRAWLER / SCRAPER / ROBOT DESIGN
Design, plan, and generate production-ready crawlers using Crawlee + Playwright/Puppeteer. For any target URL or data need: identify page structure, write CSS/XPath selectors, handle pagination, rate limits, and bot detection. Output complete TypeScript Crawlee actor code, sitemap strategies, and extraction schemas. Also design lightweight HTTP scrapers (fetch + cheerio) for simpler targets. Flag legal/ToS considerations.
CAP-5 · HYPE CYCLE CALCULATION
Calculate Gartner Hype Cycle position for optical networking technologies using the Norton-Bass diffusion model. Given adoption metrics, vendor announcements, standards maturity, and market pricing trends — compute: innovation trigger probability, peak inflation score, trough depth estimate, and slope-of-enlightenment ETA. Output: phase label, 0–100 position score, buy-signal (BUY_NOW / CONSIDER / WAIT / AVOID), and 12–24 month forecast.`;
/** A single chat turn inside a training example. */
type Message = {
  role: "system" | "user" | "assistant";
  content: string;
};

/**
 * One record of the JSONL output: an identifier, provenance labels and the
 * chat transcript that makes up the example.
 */
type Row = {
  id: string;
  source: string;
  kind: string;
  messages: Array<Message>;
};
/**
 * Derives a stable identifier for a question/answer pair.
 *
 * The id is the first 24 hex characters of the SHA-256 digest of a fixed
 * "tip_llm" prefix, the user text and the assistant text, separated by
 * "\n---\n". Identical text therefore always yields the same id.
 *
 * @param user Text of the user turn.
 * @param assistant Text of the assistant turn.
 * @returns A 24-character lowercase hex string.
 */
function makeId(user: string, assistant: string): string {
  const payload = ["tip_llm", user, assistant].join("\n---\n");
  const digest = createHash("sha256").update(payload).digest("hex");
  return digest.substring(0, 24);
}
/**
 * Wraps a question and its answer into one training record.
 *
 * Both texts are trimmed first; the trimmed versions feed the id hash and are
 * stored as the user and assistant turns, preceded by the shared system prompt.
 *
 * @param user Question text.
 * @param assistant Answer text.
 * @param kind Category label stored on the record (defaults to "db-pricing").
 * @returns The assembled record.
 */
function pair(user: string, assistant: string, kind = "db-pricing"): Row {
  const question = user.trim();
  const answer = assistant.trim();
  const id = makeId(question, answer);
  return {
    id,
    source: "tip-llm-pricing-v1",
    kind,
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      { role: "user", content: question },
      { role: "assistant", content: answer },
    ],
  };
}
// ── Query helpers ─────────────────────────────────────────────────────────────
/**
 * Loads min/avg/max competitor prices per form factor, speed, vendor and currency.
 *
 * For each competitor transceiver only the most recent price observation from
 * the last 30 days is used. Rows are limited to the listed speed tiers, skip
 * empty/'Unknown' form factors, and require at least 3 distinct products per group.
 *
 * @returns The result rows, ordered by speed, form factor and average price.
 */
async function getPriceSummaryByTier() {
  const sql = `
SELECT
t.form_factor,
t.speed_gbps,
v.name AS vendor,
COUNT(DISTINCT t.id) AS products,
ROUND(MIN(po.price)::numeric, 2) AS min_price,
ROUND(AVG(po.price)::numeric, 2) AS avg_price,
ROUND(MAX(po.price)::numeric, 2) AS max_price,
po.currency
FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id AND v.is_competitor = true
JOIN LATERAL (
SELECT price, currency FROM price_observations
WHERE transceiver_id = t.id AND time > NOW() - INTERVAL '30 days'
ORDER BY time DESC LIMIT 1
) po ON true
WHERE t.speed_gbps IN (10, 25, 40, 100, 200, 400, 800)
AND t.form_factor NOT IN ('', 'Unknown')
GROUP BY t.form_factor, t.speed_gbps, v.name, po.currency
HAVING COUNT(DISTINCT t.id) >= 3
ORDER BY t.speed_gbps, t.form_factor, avg_price
`;
  const result = await pool.query(sql);
  return result.rows;
}
/**
 * Samples equivalences whose match basis includes 'opn', together with the
 * competitor product's latest price from the last 30 days.
 *
 * Equivalences without such a price are dropped. The selection is randomised
 * by the database (ORDER BY RANDOM()), so repeated calls return different rows.
 *
 * @param limit Maximum number of rows to return (default 50).
 * @returns The sampled result rows.
 */
async function getOPNEquivalenceExamples(limit = 50) {
  const sql = `
SELECT
fx.part_number AS fx_part,
vfx.name AS fx_vendor,
comp.part_number AS comp_part,
vcomp.name AS comp_vendor,
comp.form_factor,
comp.speed_gbps,
e.match_notes,
po.price,
po.currency
FROM transceiver_equivalences e
JOIN transceivers fx ON fx.id = e.flexoptix_id
JOIN vendors vfx ON vfx.id = fx.vendor_id
JOIN transceivers comp ON comp.id = e.competitor_id
JOIN vendors vcomp ON vcomp.id = comp.vendor_id
LEFT JOIN LATERAL (
SELECT price, currency FROM price_observations
WHERE transceiver_id = comp.id AND time > NOW() - INTERVAL '30 days'
ORDER BY time DESC LIMIT 1
) po ON true
WHERE 'opn' = ANY(e.match_basis)
AND po.price IS NOT NULL
ORDER BY RANDOM()
LIMIT $1
`;
  const result = await pool.query(sql, [limit]);
  return result.rows;
}
/**
 * Samples equivalences whose match basis includes 'spec', together with the
 * competitor product's latest price from the last 30 days.
 *
 * Equivalences without such a price are dropped. The selection is randomised
 * by the database (ORDER BY RANDOM()), so repeated calls return different rows.
 *
 * @param limit Maximum number of rows to return (default 30).
 * @returns The sampled result rows.
 */
async function getSpecEquivalenceExamples(limit = 30) {
  const sql = `
SELECT
fx.part_number AS fx_part,
comp.part_number AS comp_part,
vcomp.name AS comp_vendor,
comp.form_factor,
comp.speed_gbps,
e.match_notes,
po.price,
po.currency
FROM transceiver_equivalences e
JOIN transceivers fx ON fx.id = e.flexoptix_id
JOIN transceivers comp ON comp.id = e.competitor_id
JOIN vendors vcomp ON vcomp.id = comp.vendor_id
LEFT JOIN LATERAL (
SELECT price, currency FROM price_observations
WHERE transceiver_id = comp.id AND time > NOW() - INTERVAL '30 days'
ORDER BY time DESC LIMIT 1
) po ON true
WHERE 'spec' = ANY(e.match_basis)
AND po.price IS NOT NULL
ORDER BY RANDOM()
LIMIT $1
`;
  const result = await pool.query(sql, [limit]);
  return result.rows;
}
/**
 * Loads a per-vendor pricing profile for competitor vendors.
 *
 * Uses each transceiver's most recent price observation from the last 7 days,
 * keeps vendors with at least 10 priced products, and returns at most the 20
 * vendors with the most priced products.
 *
 * NOTE(review): the result columns are named *_usd, but the query neither
 * filters nor converts by currency — confirm all observations are in USD.
 *
 * @returns The result rows, ordered by number of priced products (descending).
 */
async function getVendorPricingOverview() {
  const sql = `
SELECT
v.name AS vendor,
COUNT(DISTINCT t.id) AS products_with_prices,
ROUND(AVG(po.price)::numeric, 0) AS avg_price_usd,
ROUND(MIN(po.price)::numeric, 0) AS min_price_usd,
ROUND(MAX(po.price)::numeric, 0) AS max_price_usd
FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id AND v.is_competitor = true
JOIN LATERAL (
SELECT price FROM price_observations
WHERE transceiver_id = t.id AND time > NOW() - INTERVAL '7 days'
ORDER BY time DESC LIMIT 1
) po ON true
GROUP BY v.name
HAVING COUNT(DISTINCT t.id) >= 10
ORDER BY products_with_prices DESC
LIMIT 20
`;
  const result = await pool.query(sql);
  return result.rows;
}
/**
 * Loads the most expensive priced equivalences.
 *
 * "High-value" here means: the competitor product's latest price from the last
 * 30 days is above 50, with rows ordered from the highest price downwards.
 * No comparison against an average price is performed by this query.
 *
 * NOTE(review): no caller of this helper is visible in this file.
 *
 * @param limit Maximum number of rows to return (default 30).
 * @returns The result rows, most expensive first.
 */
async function getHighValueEquivalences(limit = 30) {
  const sql = `
SELECT
fx.part_number AS fx_part,
comp.part_number AS comp_part,
vcomp.name AS comp_vendor,
comp.form_factor,
comp.speed_gbps,
comp.reach_meters,
po.price,
po.currency,
e.confidence,
e.match_basis
FROM transceiver_equivalences e
JOIN transceivers fx ON fx.id = e.flexoptix_id
JOIN transceivers comp ON comp.id = e.competitor_id
JOIN vendors vcomp ON vcomp.id = comp.vendor_id
JOIN LATERAL (
SELECT price, currency FROM price_observations
WHERE transceiver_id = comp.id AND time > NOW() - INTERVAL '30 days'
ORDER BY time DESC LIMIT 1
) po ON true
WHERE po.price > 50
ORDER BY po.price DESC
LIMIT $1
`;
  const result = await pool.query(sql, [limit]);
  return result.rows;
}
/**
 * Loads priced competitor transceivers at 200G and above.
 *
 * Despite the name, the filter is speed_gbps >= 200. Each product carries its
 * most recent price observation; unlike the other helpers there is no recency
 * window on that observation.
 *
 * @returns The result rows, ordered by speed, form factor and price.
 */
async function get400GPricingData() {
  const sql = `
SELECT
t.part_number,
v.name AS vendor,
t.form_factor,
t.speed_gbps,
t.reach_meters,
t.wavelengths,
po.price,
po.currency
FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id AND v.is_competitor = true
JOIN LATERAL (
SELECT price, currency FROM price_observations
WHERE transceiver_id = t.id
ORDER BY time DESC LIMIT 1
) po ON true
WHERE t.speed_gbps >= 200
AND po.price IS NOT NULL
ORDER BY t.speed_gbps, t.form_factor, po.price
`;
  const result = await pool.query(sql);
  return result.rows;
}
/**
 * Loads database-wide counters in a single query.
 *
 * The returned row has: total_transceivers, fx_products (vendor name contains
 * "FLEXOPTIX", case-insensitive), opn_equivalences, spec_equivalences and
 * fresh_prices_7d (competitor products with a price observation in the last 7 days).
 *
 * @returns The single result row.
 */
async function getCoverageStats() {
  const sql = `
SELECT
(SELECT COUNT(*) FROM transceivers) AS total_transceivers,
(SELECT COUNT(*) FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id AND UPPER(v.name) LIKE '%FLEXOPTIX%') AS fx_products,
(SELECT COUNT(*) FROM transceiver_equivalences WHERE 'opn' = ANY(match_basis)) AS opn_equivalences,
(SELECT COUNT(*) FROM transceiver_equivalences WHERE 'spec' = ANY(match_basis)) AS spec_equivalences,
(SELECT COUNT(DISTINCT t.id) FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id AND v.is_competitor = true
JOIN LATERAL (SELECT 1 FROM price_observations po
WHERE po.transceiver_id = t.id AND po.time > NOW() - INTERVAL '7 days' LIMIT 1) fresh ON true
) AS fresh_prices_7d
`;
  const result = await pool.query(sql);
  return result.rows[0];
}
// ── Training pair generators ──────────────────────────────────────────────────
/**
 * Builds one market-overview QA pair per speed tier.
 *
 * Input rows are grouped by speed_gbps; tiers with fewer than two rows are
 * skipped, and at most eight rows are listed per tier. Each listed row is
 * labelled with its own currency (the summary query groups by currency), and
 * falls back to "USD" only when the row carries none.
 *
 * @param priceSummary Rows with form_factor, speed_gbps, vendor, products,
 *   min_price, avg_price, max_price and currency.
 * @returns One training record per qualifying speed tier.
 */
function generatePriceSummaryPairs(priceSummary: any[]): Row[] {
  const rows: Row[] = [];
  // Computed once so every pair of a run carries the same date stamp.
  const today = new Date().toISOString().split("T")[0];
  // Group by speed tier
  const bySpeed: Record<number, any[]> = {};
  for (const r of priceSummary) {
    if (!bySpeed[r.speed_gbps]) bySpeed[r.speed_gbps] = [];
    bySpeed[r.speed_gbps].push(r);
  }
  for (const [speed, vendors] of Object.entries(bySpeed)) {
    if (vendors.length < 2) continue;
    const speedValue = parseInt(speed, 10);
    const formFactors = [...new Set(vendors.map((v: any) => v.form_factor))].join(", ");
    // Use the row's currency instead of a hard-coded "USD" label.
    const vendorList = vendors.slice(0, 8).map((v: any) =>
` - ${v.vendor} (${v.form_factor}): min $${v.min_price}, avg $${v.avg_price}, max $${v.max_price} ${v.currency || "USD"} across ${v.products} products`
    ).join("\n");
    const trend = speedValue >= 400
      ? "400G+ products show significant price compression as the ecosystem matures"
      : speedValue >= 100
        ? "100G is the most competitive tier with the largest number of vendors"
        : "Lower speeds have stable pricing with established supply chains";
    rows.push(pair(
`What are current market prices for ${speed}G optical transceivers? Give me a competitive pricing overview.`,
`Based on TIP's live pricing database (data current as of ${today}), here is the ${speed}G transceiver market pricing overview:
**Form factors available:** ${formFactors}
**Competitor pricing by vendor:**
${vendorList}
**Key observations:**
- Price range spans from low-cost compatible vendors to premium OEM alternatives
- ${trend}
- Volume pricing and B2B discounts can reduce costs by 20-40% for large orders
For Flexoptix-equivalent part numbers at these specifications, the compatibility matrix maps these products to certified FX alternatives.`
    ));
  }
  return rows;
}
/**
 * Builds one QA pair per Flexoptix part number from OPN-based equivalences.
 *
 * Rows are bucketed by fx_part; every bucket lists all of its competitor
 * matches with price and currency. Form factor, speed and notes are taken
 * from the first match in the bucket.
 *
 * @param equivalences Rows with fx_part, comp_vendor, comp_part, form_factor,
 *   speed_gbps, match_notes, price and currency.
 * @returns One training record per distinct fx_part.
 */
function generateOPNEquivalencePairs(equivalences: any[]): Row[] {
  // Bucket the matches by FX part number.
  const byFX: Record<string, any[]> = {};
  for (const entry of equivalences) {
    const bucket = byFX[entry.fx_part] ?? (byFX[entry.fx_part] = []);
    bucket.push(entry);
  }
  // Every bucket holds at least one match, so each one yields a record.
  return Object.entries(byFX).map(([fxPart, matches]) => {
    const first = matches[0];
    const matchList = matches
      .map((match: any) => ` - ${match.comp_vendor} ${match.comp_part}: $${match.price} ${match.currency}`)
      .join("\n");
    const notes = first.match_notes ? `\n**Notes:** ${first.match_notes}` : "";
    return pair(
`What competitor products are OPN-confirmed equivalents to Flexoptix ${fxPart}?`,
`Based on the TIP manufacturer compatibility matrix, the following are OPN-confirmed (confidence: 1.0) equivalences for Flexoptix **${fxPart}** (${first.form_factor}, ${first.speed_gbps}G):
**Manufacturer-confirmed equivalences:**
${matchList}
These matches are derived from the Flexoptix compatibility matrix which lists the original OEM part numbers that each FX product replaces. Confidence = 1.0 means this is manufacturer-confirmed, not spec-estimated.
${notes}
For procurement decisions, these prices reflect current market rates. Contact Flexoptix for volume pricing on the FX equivalent.`
    );
  });
}
/**
 * Builds one QA pair per "<form factor>-<speed>G" group from spec-based equivalences.
 *
 * Groups with fewer than two matches are skipped and at most six matches are
 * listed per group. The matching-criteria line uses the first match's notes,
 * or a generic description when it has none.
 *
 * @param equivalences Rows with comp_vendor, comp_part, form_factor,
 *   speed_gbps, match_notes, price and currency.
 * @returns One training record per qualifying group.
 */
function generateSpecEquivalencePairs(equivalences: any[]): Row[] {
  // Bucket the matches by form factor and speed.
  const groups: Record<string, any[]> = {};
  for (const entry of equivalences) {
    const groupKey = `${entry.form_factor}-${entry.speed_gbps}G`;
    const bucket = groups[groupKey] ?? (groups[groupKey] = []);
    bucket.push(entry);
  }
  return Object.entries(groups)
    .filter(([, matches]) => matches.length >= 2)
    .map(([key, matches]) => {
      const first = matches[0];
      const matchList = matches
        .slice(0, 6)
        .map((match: any) => ` - ${match.comp_vendor} ${match.comp_part}: $${match.price} ${match.currency}`)
        .join("\n");
      const criteria = first.match_notes || `Form factor: ${first.form_factor}, Speed: ${first.speed_gbps}G, Reach tier, Wavelength ±10nm`;
      return pair(
`I'm looking for ${key} compatible transceivers. What are the spec-based equivalent options with pricing?`,
`Based on TIP's spec-matching engine for **${key}** transceivers (confidence: 0.85, spec-matched):
**Available compatible products (current market prices):**
${matchList}
**Matching criteria applied:**
${criteria}
**Important notes:**
- Spec matches have 0.85 confidence (vs 1.0 for OPN-confirmed matches)
- Verify specific reach and wavelength requirements before ordering
- For OPN-confirmed alternatives with the highest confidence, check if an FX part number maps to this spec
Flexoptix offers fully programmable transceivers that can often address multiple spec variants from a single SKU, reducing inventory complexity.`
      );
    });
}
/**
 * Builds one pricing QA pair per form factor from the high-speed product list.
 *
 * Products are grouped by form_factor; each group lists every product with its
 * reach, optional wavelength, price and currency. The distinct speeds of a
 * group are joined with "/" in ascending numeric order.
 *
 * @param products400g Rows with vendor, part_number, form_factor, speed_gbps,
 *   reach_meters, wavelengths, price and currency.
 * @returns One training record per form factor (empty when there is no input).
 */
function generate400GPairs(products400g: any[]): Row[] {
  const rows: Row[] = [];
  const today = new Date().toISOString().split("T")[0];
  const byFormFactor: Record<string, any[]> = {};
  for (const p of products400g) {
    if (!byFormFactor[p.form_factor]) byFormFactor[p.form_factor] = [];
    byFormFactor[p.form_factor].push(p);
  }
  for (const [ff, products] of Object.entries(byFormFactor)) {
    const priceList = products.map((p: any) => {
      // Flag a missing reach explicitly instead of rendering "nullm".
      // NOTE(review): assumes reach_meters can be NULL in the DB — confirm.
      const reach = p.reach_meters != null ? `${p.reach_meters}m` : "reach n/a";
      const wavelength = p.wavelengths ? " @ " + p.wavelengths + "nm" : "";
      return ` - ${p.vendor} ${p.part_number} (${reach}${wavelength}): $${p.price} ${p.currency}`;
    }).join("\n");
    // Numeric comparator: the default sort compares as strings, which would
    // place 1600 before 200.
    const speeds = [...new Set(products.map((p: any) => p.speed_gbps))]
      .sort((a: any, b: any) => Number(a) - Number(b))
      .join("/");
    const context = ff === "QSFP-DD"
      ? "QSFP-DD 400G is the dominant 400G form factor for data center deployments, with 8x50G PAM4 electrical interface"
      : ff === "QSFP112"
        ? "QSFP112 uses 4x100G PAM4 lanes, preferred for high-density 400G where thermal budget is critical"
        : ff === "OSFP"
          ? "OSFP supports up to 800G and is preferred for AI/ML cluster spine deployments"
          : `${ff} is a key form factor in next-gen networking deployments`;
    rows.push(pair(
`What is current market pricing for ${ff} ${speeds}G transceivers? I'm planning a data center upgrade.`,
`Here is the current TIP pricing intelligence for **${ff} ${speeds}G** transceivers (data: ${today}):
**Market pricing:**
${priceList}
**Market context:**
- ${context}
- Price points vary significantly by reach: DR4/FR4 (≤2km) is lowest cost; LR4/ER4/ZR (10km+) commands premium
- 400G pricing has compressed 30-40% over the past 18 months as manufacturing volumes increased
For Flexoptix QSFP-DD 400G equivalents, the D.xxx product family covers SR4, DR4, FR4, and LR4 variants with full compatibility guarantees.`
    ));
  }
  return rows;
}
/**
 * Builds the single vendor-landscape QA pair.
 *
 * Lists up to twelve vendors with their product count, average price and
 * price range. The range is rendered as "$min–$max" with an explicit dash so
 * the two bounds cannot run together.
 *
 * @param vendorData Rows with vendor, products_with_prices, avg_price_usd,
 *   min_price_usd and max_price_usd.
 * @returns The vendor-overview training record.
 */
function generateVendorOverviewPair(vendorData: any[]): Row {
  const vendorList = vendorData.slice(0, 12).map((v: any) =>
` - **${v.vendor}**: ${v.products_with_prices} products, avg $${v.avg_price_usd} (range: $${v.min_price_usd}–$${v.max_price_usd})`
  ).join("\n");
  return pair(
`Which compatible optical transceiver vendors does TIP track, and what are their pricing profiles?`,
`TIP tracks real-time pricing across all major compatible transceiver vendors. Here is the current competitive landscape (data: ${new Date().toISOString().split("T")[0]}):
**Vendors with live pricing data:**
${vendorList}
**Vendor tier summary:**
- **Tier 1 (Broad catalog, competitive pricing):** fs.com, 10Gtek, Optcore, Fibertrade — large assortment, aggressive retail pricing, good for 10G/25G/100G commodity items
- **Tier 2 (Specialized/niche):** IntelliPhy, ATGBICS, QSFPTEK — focused on specific form factors or regions
- **B2B Quote-Only:** Eoptolink, Ascent Optics, GAO Tek — no public pricing, volume/contract based
- **OEM/Premium:** Cisco, Juniper, Arista — original vendor pricing, highest cost, lock-in dependent
TIP updates prices continuously via automated scrapers. The compatibility matrix maps these competitor products to Flexoptix FX equivalents with confidence scores.`
  );
}
/**
 * Builds the single database-coverage QA pair (kind "db-coverage").
 *
 * Every counter is converted to a number before formatting, because the
 * driver may deliver COUNT(*) results as strings, on which toLocaleString()
 * applies no digit grouping. Formatting is pinned to "en-US" so the generated
 * text does not depend on the locale of the machine running the script.
 *
 * @param stats Row with total_transceivers, fx_products, opn_equivalences,
 *   spec_equivalences and fresh_prices_7d.
 * @returns The coverage training record.
 */
function generateCoverageStatsPair(stats: any): Row {
  // Accepts number or numeric string and renders it with thousands separators.
  const formatCount = (value: unknown): string => Number(value).toLocaleString("en-US");
  return pair(
`What is the current scope and coverage of the Transceiver Intelligence Platform database?`,
`The TIP database as of ${new Date().toISOString().split("T")[0]} contains:
**Catalog coverage:**
- **${formatCount(stats.total_transceivers)} transceivers** total (all vendors)
- **${formatCount(stats.fx_products)} Flexoptix products** — the reference catalog
- Multiple competitor vendors tracked continuously
**Equivalence matching:**
- **${formatCount(stats.opn_equivalences)} OPN-confirmed equivalences** (confidence: 1.0) — manufacturer-verified
- **${formatCount(stats.spec_equivalences)} spec-based equivalences** (confidence: 0.85) — algorithmically matched by form factor + speed + reach + wavelength
- Coverage: ~88% of Flexoptix products have at least one confirmed competitor equivalent
**Pricing intelligence:**
- **${formatCount(stats.fresh_prices_7d)} competitor products with fresh pricing** (updated within 7 days)
- Automated scrapers cover: fs.com, sfpcables.com (10Gtek), Optcore, Fibertrade, ATGBICS, IntelliPhy, and more
- Prices updated continuously via pg-boss job scheduler (24/7 operation)
**Data quality:**
- OPN matches use the official Flexoptix compatibility matrix — same source used by network engineers
- Spec matches use: form_factor + speed_gbps + reach tier (SR/IR/LR/ER/ZR) + wavelength ±10nm
- Safety cap: FX products matching >30 competitors are excluded (too generic, unreliable)`,
    "db-coverage"
  );
}
// ── Main ──────────────────────────────────────────────────────────────────────
/**
 * Script entry point: runs all queries in parallel, turns the results into
 * training pairs, removes pairs with duplicate ids (first occurrence wins),
 * writes them as JSONL to training-data/tip-llm-pricing-v1.jsonl under the
 * current working directory, and finally closes the connection pool.
 */
async function main() {
  console.log("Generating TIP_LLM pricing training data from DB...\n");

  // All queries are independent, so they are issued concurrently.
  const [priceSummary, opnEquivalences, specEquivalences, vendorData, products400g, stats] = await Promise.all([
    getPriceSummaryByTier(),
    getOPNEquivalenceExamples(60),
    getSpecEquivalenceExamples(40),
    getVendorPricingOverview(),
    get400GPricingData(),
    getCoverageStats(),
  ]);

  console.log(`Price summary rows: ${priceSummary.length}`);
  console.log(`OPN equivalence examples: ${opnEquivalences.length}`);
  console.log(`Spec equivalence examples: ${specEquivalences.length}`);
  console.log(`Vendor overview rows: ${vendorData.length}`);
  console.log(`400G+ products: ${products400g.length}`);

  const allPairs: Row[] = [
    ...generatePriceSummaryPairs(priceSummary),
    ...generateOPNEquivalencePairs(opnEquivalences),
    ...generateSpecEquivalencePairs(specEquivalences),
    ...generate400GPairs(products400g),
    generateVendorOverviewPair(vendorData),
    generateCoverageStatsPair(stats),
  ];

  // Deduplicate by id, keeping the first record seen for each id.
  const knownIds = new Set<string>();
  const unique: typeof allPairs = [];
  for (const candidate of allPairs) {
    if (knownIds.has(candidate.id)) continue;
    knownIds.add(candidate.id);
    unique.push(candidate);
  }
  console.log(`\nGenerated ${unique.length} unique training pairs`);

  const outDir = join(process.cwd(), "training-data");
  mkdirSync(outDir, { recursive: true });
  const outPath = join(outDir, "tip-llm-pricing-v1.jsonl");
  // One JSON document per line, with a trailing newline.
  const jsonl = unique.map((record) => JSON.stringify(record)).join("\n") + "\n";
  writeFileSync(outPath, jsonl);

  console.log(`\nOutput: ${outPath}`);
  console.log(`Training pairs: ${unique.length}`);
  await pool.end();
}
// Run the script. On any failure: log it, start closing the pool, and exit with status 1.
main().then(undefined, (fatal) => {
  console.error("Fatal:", fatal);
  pool.end();
  process.exit(1);
});