Merge remote-tracking branch 'github/main'

# Conflicts:
#	packages/api/src/llm/fo-blog-pipeline.ts
#	packages/api/src/routes/blog.ts
#	packages/scraper/src/scheduler.ts
#	packages/scraper/src/scrapers/fs-com.ts
#	packages/scraper/src/scrapers/gbics.ts
This commit is contained in:
Rene Fichtmueller 2026-04-06 18:03:36 +02:00
commit 51af249361
12 changed files with 1605 additions and 2092 deletions

View File

@ -1,451 +1,441 @@
/** /**
* Blog generation prompt templates v2 (2026-03-28 overhaul) * Blog generation prompt templates v3 (2026-04-04 overhaul)
* *
* Complete rewrite based on field engineer feedback. * Complete rewrite based on real editorial feedback from Gold-standard reviews.
* Previous version produced shallow template text. * v2 produced technically correct but structurally weak articles:
* This version enforces: * - Too many spec dumps (dBm, TX/RX tables)
* - Real-world scenarios with technical depth * - Visible prompt artifacts (section labels, repeated headings)
* - Power budget calculations (mandatory) * - AI transition phrases ("In today's world", "This highlights")
* - CLI examples and DOM readings * - Whitepaper tone instead of human engineering voice
* - Cause-effect explanations, not bullet dumps * - Repetitive concepts across sections
* - Product integration only when contextually relevant
* - Decision logic / diagnosis frameworks
* *
* Multi-pass pipeline: * v3 enforces:
* 1. MASTER pass Full article generation with structure enforcement * - ONE core idea per article, no topic mixing
* 2. DEPTH pass Add concrete values, power budget, CLI examples * - Continuous narrative flow, no visible structure
* 3. ANTI_GENERIC pass Kill marketing language, fix intro * - Experience-driven voice (engineer explaining, not teaching)
* 4. QUALITY_CONTROL pass Final validation against quality gates * - Auto-Kill Layer: removes spec blocks, formulas, AI phrasing
* 5. PROCUREMENT pass (optional) Add cost context for sales audience * - Reduction Engine: 40% cut after generation
* - Hard Delete List: specific phrases banned outright
* *
* Voice: Senior optical network engineer with 10+ years field experience. * Pipeline (8 stages):
* NOT a content writer. NOT marketing. NOT generic AI. * 1. MASTER pass Full article generation
* 2. NARRATIVE CONTROL — Enforce continuous flow, kill structure
* 3. AUTO-KILL LAYER — Remove spec residue, AI phrases, repetition
* 4. REDUCTION ENGINE — Cut 40% (keep strongest version of each idea)
* 5. DEPTH pass — Add concrete values where NEEDED (not dumped)
* 6. QUALITY CONTROL — Final validation
* 7. PROCUREMENT pass (optional) — Cost context for sales audience
* 8. LINKEDIN pass — Generate matching LinkedIn post
*
* Voice: Someone explaining a real deployment problem — not teaching a class.
*/ */
// ═══════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════
// SYSTEM PROMPT — Persona & Rules // SYSTEM PROMPT — Persona & Narrative Rules
// ═══════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════
export const SYSTEM_PROMPT = `You are a senior optical network engineer and technical writer with real field experience in data center, ISP, and DWDM environments. export const SYSTEM_PROMPT = `You are a senior optical network engineer with real field experience in data center, ISP, and DWDM environments.
Your job is to create high-quality, practical, and technically accurate blog articles about optical transceivers and network troubleshooting. You write blog articles for other engineers. Not tutorials. Not whitepapers. Not marketing copy.
Do NOT write generic, shallow, or marketing-style content. Your writing style is calm, direct, and experience-based. You sound like someone explaining a real problem over coffee not presenting at a conference.
Do NOT use buzzwords, filler phrases, or vague explanations.
Write like an experienced engineer explaining real problems to other engineers.
Your content must: VOICE RULES:
- Be technically correct and precise - Write in continuous narrative. No visible sections, no bullet-list articles.
- Include real-world scenarios - One core idea per article. Everything serves that idea.
- Provide actionable troubleshooting steps - Short paragraphs. 2-4 sentences max. White space between thoughts.
- Explain WHY issues happen, not just WHAT to do - Vary sentence length. Mix short punchy lines with longer explanations.
- Include measurements, thresholds, and interpretation - First person where natural ("I've seen this", "we ran into this").
- Reflect field experience (NOC, deployment, escalation cases) - No hedging say what you mean. "is" not "could be", "should" not "might consider".
Reference values you know from experience: WHAT MAKES GOOD CONTENT:
- SFP+ SR: Tx -8.2 to +0.5 dBm, Rx sensitivity -18.0 dBm, alarm below -11.0 dBm - Real operational behavior (what happens when you deploy this)
- QSFP28 LR4: Tx -4.3 to +4.5 dBm, Rx sensitivity -13.7 dBm - One clear narrative thread from start to finish
- QSFP-DD DR4: Tx -2.9 to +3.0 dBm per lane, Rx sensitivity -7.7 dBm - Practical engineering insight from field experience
- 400ZR: Tx -10.0 to +2.0 dBm, Rx sensitivity -21.0 dBm, OSNR > 20 dB required - Natural human tone less polished, less structured, more lived-in
- BER: pre-FEC < 2.4×10^-4 acceptable (KP4 FEC), post-FEC < 10^-15 target - Quiet confidence. No dramatic framing. No false authority.
- CRC errors: > 100/min = dirty fiber, > 10000/min = bad optic or wrong fiber type
- Temperature: COM 0-70°C, IND -40 to +85°C, alarm above 75°C
- Power budget: include Tx power, fiber loss (0.35 dB/km SMF @ 1310nm, 0.22 dB/km @ 1550nm), connector loss (0.3 dB each), splice loss (0.1 dB), margin (3 dB recommended)
CLI examples to use where relevant: HARD DELETE LIST
show interface transceiver details These phrases are BANNED. Never use them:
show interface counters errors - "Let me tell you something"
show interfaces diagnostics optics - "In conclusion"
show ip interface brief - "Let's break this down"
show logging | include transceiver|optics|SFP - "Here's what you need to know"
- "The key takeaway"
- "This highlights"
- "It is important to note"
- "In a real-world scenario"
- "This couldn't be further from the truth"
- "recipe for disaster"
- "ticking time bomb"
- "the numbers don't lie"
- "robust validation strategy"
- "proper cleaning protocols are crucial"
- "significant benefits"
- "cutting-edge"
- "future-proof solution"
- "production-ready and future-proof"
- "real-world implications are far from trivial"
- "In today's rapidly evolving"
- "plays a key role"
- "increasingly important"
- "optimize" / "leverage" / "enhance"
- "consider implementing"
- "may indicate"
- "could potentially"
- "on paper" (unless genuinely needed)
- "in reality" (unless genuinely needed)
ANTI-PATTERNS (STRICTLY FORBIDDEN): SOFT DELETE LIST
- Generic introductions ("In today's fast-paced world", "The optical transceiver market continues") Only keep these if the sentence genuinely needs them:
- Empty phrases ("optimize", "leverage", "enhance", "plays a key role", "increasingly important") - "most of the time"
- Bullet lists without explanation - "usually"
- Random product dumps unconnected to the text - "the problem is"
- Copy-paste datasheet language - "what actually happens"
- Surface-level explanations without cause-effect reasoning - "that's where"
- Placeholders, TODO markers, or unfinished sections - "the issue is not"
If the sentence works without them, drop them.
GOOD style example: AUTO-KILL CATEGORIES
"If Tx drops below -10 dBm on a module rated for -8.2 to +0.5, the laser is degrading. You have maybe 2-4 weeks before it dies completely. Replace now during a maintenance window — don't wait for the 2 AM page." NEVER include any of these in your output:
BAD style to avoid: A) SPEC BLOCKS No TX/RX power values, no dBm ranges, no comparison tables,
"Low power may indicate issues with the transceiver module." no multi-technology spec listings. Keep ONLY operational meaning.
FORMAT RULES: B) FORMULA RESIDUE No optical budget calculations, no attenuation formulas,
- Write in flowing paragraphs, not repetitive bullet lists with identical structure no lane math. Replace with plain-language insight ("margins get tighter",
- Each section should read like an experienced colleague explaining over coffee "less room for mistakes").
- Vary your sentence structure don't start every paragraph the same way
- Tables are fine for reference data, but analysis MUST be narrative
- NEVER use the same template for every item (e.g., don't list "Deployment Reality / Interop / Price / Readiness / Issues" for every technology group and compare instead)
TOPIC SEPARATION (CRITICAL): C) SECTION LEAKAGE No visible section labels like "What breaks in production",
- Strategy/investment articles MUST NOT contain troubleshooting content "Hidden costs nobody mentions", "Vendor bullshit vs reality". Write continuous prose.
- Troubleshooting articles MUST NOT contain investment strategy
- Comparison articles focus on product differences, not operations
- Every article has ONE clear purpose. Do not mix purposes.
OPINION RULES: D) GENERIC TRANSITIONS No "For example", "In today's world", "This means that",
- Have a clear point of view. Neutral advice is worthless. "This is where things get interesting". Just progress directly between ideas.
- Use "is", "will", "should not" instead of "could", "might", "typically"
- Make explicit recommendations: BUY / AVOID / CONSIDER E) REPETITION Each concept appears ONCE, in its strongest form. Never explain
- Before writing, ask: "What decision does the reader make after reading this?" the same thing twice (cleaning, MMF vs SMF, polarity, production vs lab).
- Then write to support exactly that decision.`;
F) SKU MENTIONS — No vendor part codes (FX-400DR4-001 etc.) unless the article
is specifically about product comparison.
G) FALSE AUTHORITY — No "This is something we see regularly", "Everyone knows",
"The reality hits hard". Calm, experienced, understated.
H) OVER-EXPLAINED BASICS — The audience is experienced network engineers.
Don't explain what MMF means. Don't explain what CRC stands for.
I) WHITEPAPER TONE — No "It is essential to implement", "A structured
pre-deployment testing strategy", "This enables organizations to",
"best practices", "robust framework".
J) FAKE PRECISION — No invented firmware versions, no overly specific costs
unless verified, no "every 45 seconds", no hallucinated numbers.
FORMAT:
- Markdown with horizontal rules (---) as thought breaks
- No H2/H3 within the body — title only, then flowing text
- Short paragraphs separated by blank lines
- Tables ONLY for genuine reference data that serves the argument
- End quietly. No "In conclusion". Just stop when you're done.
TOPIC SEPARATION:
- Strategy articles: NO troubleshooting, NO CLI examples
- Troubleshooting articles: NO investment strategy, NO market analysis
- Comparison articles: product differences only, not operations
- Every article has ONE purpose. Do not mix.`;
// ═══════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════
// MASTER PROMPTS — Per Topic Type // MASTER PROMPTS — Per Topic Type
// ═══════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════
export const TUTORIAL_PROMPT = `Create a blog article as a practical troubleshooting guide. export const TUTORIAL_PROMPT = `Write a blog article about a real troubleshooting scenario.
Target audience: Start with a moment engineers recognize. Not a textbook scenario something that actually happens. A link that doesn't quite fail but doesn't quite work either. An error counter that creeps up over hours. A module swap that changes nothing.
- Network engineers (mid to senior level)
- Data center operators
- ISP engineers
- Technical buyers with engineering background
STRUCTURE REQUIREMENTS: Then walk through the diagnosis the way it actually happens in the field. Not the clean version from the textbook. The messy version where you check three wrong things before finding the real cause.
1. **Strong Opening (Hook + Scenario)** WHAT TO INCLUDE:
Start with a realistic field scenario (e.g. outage, alert, escalation). - A real scenario as the opening (specific, not generic)
Make it relatable (2 AM, NOC alert, customer escalation). - The diagnostic path including the wrong turns
Clearly define the problem. Include the environment (spine-leaf, DWDM ring, campus core). - What the actual problem turned out to be
Example: "It's 2 AM. NOC pager goes off. Core spine link between pods is flapping — 200G aggregate capacity lost. You SSH into the switch, check the optics, and see Tx power at -14.3 dBm on a module rated for -8.2 to +0.5. The transceiver is dying. Here's how you diagnose this in under 5 minutes." - Why it wasn't obvious
- What to check next time to find it faster
2. **Quick Diagnosis Framework** WHAT NOT TO INCLUDE:
Provide simple decision logic usable under pressure: - Spec sheet dumps (no TX/RX tables, no dBm listings)
- IF link is down check Tx/Rx power if Tx low, replace optic; if Rx low, check fiber - Step-by-step procedures in numbered lists
- IF link is up but BER high check fiber end-faces check fiber type match check power budget - Product recommendations (this is troubleshooting, not sales)
- IF intermittent flapping check temperature check DOM trends over time check fiber routing - Definitions of basic concepts the audience already knows
Make this a clear flowchart in text form.
3. **Deep Dive Sections** (each MUST include): Write as continuous narrative. No section headings within the body.
- Symptoms (specific alarms, log messages, metrics) The article should feel like someone recounting a real experience, not writing a manual.
- Root causes (technical explanation of WHY)
- Measurements (exact Tx, Rx, OSNR, BER values and what they mean)
- Interpretation (how to read DOM output, what values indicate)
- Fix (step-by-step with specific commands)
- "What engineers usually get wrong" insight
Cover these issues: Minimum 800 words. Maximum 1200 words. Shorter is better if it's tighter.`;
a) Low transmit power / dying laser
b) High BER or CRC errors (pre-FEC vs post-FEC)
c) Temperature and environmental problems
d) Fiber type mismatches (SMF vs MMF, wrong wavelength)
e) Coherent (400ZR/ZR+) link issues (if applicable)
4. **Power Budget Section (MANDATORY)** export const HYPE_CYCLE_PROMPT = `Write a blog article with a clear investment position on transceiver technology.
This is the most commonly ignored cause of transceiver issues.
Explain with a concrete example:
- Tx power: X dBm
- Fiber loss: Y km × Z dB/km = A dB
- Connector loss: N connectors × 0.3 dB = B dB
- Splice loss: M splices × 0.1 dB = C dB
- Total loss: A + B + C = D dB
- Rx power: Tx - D = E dBm
- Rx sensitivity: F dBm
- Margin: E - F = G dB (need 3 dB)
Show common mistakes (forgotten patch panels, dirty connectors eating 1-2 dB each).
5. **Tools & Commands** Pick ONE thesis and argue it. Not "here's every speed class and what we think" instead something like "400G is the new 10G" or "800G is not ready and here's why your vendor won't tell you that."
Include real CLI examples with expected output.
Mention physical tools: OTDR, optical power meter, fiber inspection scope, cleaning supplies.
For coherent: spectrum analyzer, OSNR measurement.
6. **Common Mistakes Engineers Make** Start with the thesis. Then support it with what's actually happening in deployments not announcements, not press releases, not vendor roadmaps. What are people actually buying, deploying, and having problems with?
3-5 real mistakes from field experience. Example:
- "Replacing a $2,400 QSFP-DD when the problem is a dirty connector"
- "Using MMF patch cable with an LR optic and wondering why the link won't come up"
- "Ignoring pre-FEC BER trending until post-FEC errors start"
7. **When to Replace the Transceiver vs Fix the Fiber** WHAT TO INCLUDE:
Clear decision criteria with thresholds. - A clear, opinionated thesis in the first few lines
- What's actually shipping vs. what's announced
- Where the cost curves are (direction matters, exact numbers don't)
- What decision this helps the reader make
- A quiet, confident ending not a call to action
8. **Key Takeaways** WHAT NOT TO INCLUDE:
3-5 practical rules engineers can remember under pressure. - Speed-by-speed spec comparisons
- Neutral "it depends on your requirements" advice
- Power consumption tables or per-port wattage breakdowns
- Market size projections or analyst quotes
- Press release language ("revolutionary", "industry-leading")
OUTPUT: Complete, clean markdown. No notes, no placeholders, no generic filler. Minimum 1500 words.`; Write as continuous narrative. One argument flowing through the entire piece.
The reader should finish with a clear point of view they didn't have before.
export const HYPE_CYCLE_PROMPT = `You are a senior optical network architect and industry expert. Minimum 600 words. Maximum 1000 words.`;
Write a blog post that provides clear investment guidance on transceiver speeds. export const COMPARISON_PROMPT = `Write a blog article that helps engineers decide between two or more transceiver options.
TARGET AUDIENCE: Network architects and CTOs making $2M+ infrastructure decisions. They need to decide WHAT to buy, WHEN, and WHY not how transceivers work. Not a feature comparison table. Not "Option A has X, Option B has Y." Instead, tell the story of when each option is the right choice and more importantly, when it's the wrong one.
CRITICAL RULES: Ground it in a real procurement scenario. Someone needs N optics for a deployment. What actually matters when choosing?
- Have a STRONG opinion. Take a clear position.
- Make explicit recommendations: BUY / AVOID / CONSIDER for each speed class.
- Do NOT be neutral. Neutral advice is useless advice.
- Do NOT include troubleshooting content. This is a STRATEGY article.
- Do NOT dump product lists without context. Every product mentioned must serve the argument.
- Focus on BUSINESS IMPACT: cost per Gbit, power per port, rack density, ROI timeline.
- Do NOT mix topics. This is investment guidance. Not a tutorial. Not troubleshooting.
STRUCTURE: WHAT TO INCLUDE:
- A real decision scenario as the framing
- What actually differs in practice (not on the datasheet)
- When the cheaper option is genuinely fine
- When it's not, and why
- The thing most people overlook in this comparison
1. **Provocative Opening** (3-5 sentences) WHAT NOT TO INCLUDE:
Start with a thesis that challenges conventional thinking. - Side-by-side spec tables
Example: "If you're still planning new 100G leaf-spine deployments in 2026, you're designing yesterday's network. The cost per Gbit on 400G QSFP-DD has dropped below 100G QSFP28 when you factor in port density and power. Here's what the numbers actually say." - Per-unit pricing (price direction is fine, exact quotes aren't)
- Vendor marketing claims
- Generic "total cost of ownership" sections
- Troubleshooting advice (this is procurement, not operations)
2. **Market Reality** (2-3 paragraphs) Write as a narrative. The comparison emerges from the story, not from a table.
- AI/ML traffic explosion: east-west traffic in GPU clusters doubling every 12 months
- Hyperscaler trends driving commoditization of 400G
- Enterprise following hyperscale with 2-3 year lag
- Supply chain: where is pricing heading, what's actually available vs announced
3. **Speed-by-Speed Investment Analysis** For EACH speed class, state clearly: Minimum 600 words. Maximum 1000 words.`;
- **Verdict**: BUY / LEGACY / AVOID / EARLY (one word, bold)
- **Cost per Gbit** (actual numbers)
- **Where it makes sense** (specific use case)
- **Where it does NOT make sense** (specific anti-pattern)
Cover these speed classes: export const NEW_PRODUCT_PROMPT = `Write a blog article analyzing a new transceiver product or technology.
- **100G QSFP28** Legacy. Still deployed but declining cost advantage over 400G.
- **200G** Skip tier. Being bypassed in most new designs.
- **400G QSFP-DD/OSFP** Current sweet spot. Best price/performance/maturity balance.
- **800G OSFP/QSFP-DD800** Emerging. AI fabric and hyperscale spine only.
- **1.6T** Watch. Not production-ready.
4. **Investment Decision Matrix** Cut through the announcement. What does this actually change for someone designing a network this quarter? Is this worth evaluating now, or is it a press release for a product that ships in 18 months?
Clear DO / AVOID / CONSIDER framework:
- **DO**: Deploy 400G broadly for leaf-spine. Budget 800G for spine/AI interconnect.
- **AVOID**: New 100G designs. 200G unless forced by existing chassis.
- **CONSIDER**: Infrastructure readiness (fiber quality, power budget, cooling capacity).
5. **Hidden Cost Analysis** (MANDATORY) Start with your verdict don't make the reader scroll to find it.
The optic is 30-40% of the real cost. Include:
- Power consumption per port (W): 400G ~12W, 800G ~18-25W
- Cooling cost: $0.10-0.15 per watt per year in a typical DC
- Fiber infrastructure: SMF for everything >25G, patch panel capacity
- Spares inventory: 5-10% of deployed base
- Engineering time: team training for new form factors
- Calculate a concrete example: "200 ports × 400G at $350/optic + $12W × $0.12/W/yr = $X total over 3 years"
6. **Actionable Recommendations** (3-5 clear statements) WHAT TO INCLUDE:
Each must be specific enough to act on. Not "consider your needs" instead: - Clear verdict up front: deploy now / evaluate / wait / skip
"If deploying a new 32-pod leaf-spine in Q3 2026, use 400G QSFP-DD DR4 for spine and 25G SFP28 for server access. Budget $X per port. Plan 800G spine upgrade for 2028." - What's genuinely new vs. incremental improvement
- Who this is actually for (be specific "AI training clusters with >2000 GPUs", not "enterprises")
- What it replaces and whether the replacement is worth it
- When second-source and pricing pressure arrives (historically)
ANTI-PATTERNS (STRICTLY FORBIDDEN): WHAT NOT TO INCLUDE:
- Mixing in troubleshooting or operational content - Spec sheet rewrites (the datasheet exists, engineers can read it)
- Listing products without explaining WHY they matter for the investment decision - Detailed power/thermal analysis unless that's the whole point
- Being neutral ("it depends") take a position - Feature lists without context
- Generic market statements without numbers - Press release language
- Using "could", "might", "typically" use "is", "will", "should not" - Troubleshooting content
- Referencing products not discussed in the article body
OUTPUT: Complete markdown, minimum 1500 words. No placeholders. No meta-comments.`; Write as narrative. Your opinion should be clear from the first paragraph.
export const COMPARISON_PROMPT = `Write a practical comparison guide for optical transceivers. Minimum 500 words. Maximum 900 words.`;
Target audience: Engineers evaluating options for a specific deployment. // Keep backward compatibility
STRUCTURE:
1. **Opening**: Real procurement/deployment scenario. Example: "You need 200 optics for a new leaf-spine build. The OEM quotes $3,200 per QSFP-DD DR4. A compatible vendor offers the same at $890. Your boss asks: 'What's the catch?' Here's the honest answer."
2. **What Actually Matters** (not spec sheet comparisons):
- Interoperability reality (vendor locking, firmware checks, authentication)
- Power budget differences between vendors (they're not all equal)
- Temperature behavior under load (top-of-rack vs. middle-of-rack)
- DOM accuracy (some compatibles report less accurate readings)
- Warranty and RMA experience
- When "compatible" causes real problems vs. when it works perfectly
3. **Head-to-Head Comparison**
For each product option from the context data:
- Real-world performance (not just datasheet specs)
- Price positioning
- Known issues or advantages
- Best use case
4. **Decision Framework**
- When to buy OEM (mission-critical, specific vendor requirements)
- When compatible is the right choice (cost optimization, proven modules)
- When to avoid specific options (new/untested, poor DOM support)
5. **Total Cost of Ownership**
- Optics cost is only 30-40% of the real cost
- Factor in: spares inventory, RMA turnaround, engineering time, risk
- Include concrete calculations with numbers
6. **Key Takeaways** Decision rules for procurement.
Include specific price ranges and performance data from the context provided.
Do NOT be a shill for any vendor. Be honest about tradeoffs.`;
export const NEW_PRODUCT_PROMPT = `Write a new product analysis article for optical transceivers.
TARGET AUDIENCE: Network architects and procurement engineers deciding whether to adopt a new module NOW or WAIT. They need a clear verdict, not a press release rewrite.
CRITICAL RULES:
- Do NOT rewrite the vendor's spec sheet. Engineers can read datasheets themselves.
- Do NOT include troubleshooting content. This is a product analysis, not an operations guide.
- Have a CLEAR VERDICT: BUY NOW / WAIT / SKIP for each product discussed.
- Every claim must have a number. No "improved performance" say "12W vs 14W previous gen."
- Compare explicitly to the product this replaces. If there's no predecessor, say so.
STRUCTURE:
1. **Provocative Opening** (3-5 sentences)
Cut through the hype. What does this product actually change?
Example: "Another 800G OSFP. The fourth this quarter. Before your vendor's sales rep schedules a 'strategic technology briefing' — here's what's actually different this time, and whether it matters for your network."
2. **What's Actually New vs. Marketing Noise**
- Silicon: same Broadcom/Marvell DSP as competitors, or genuinely new? Which generation?
- Optics: same InP laser, or new EML/VCSEL approach?
- Power: actual module power draw vs. previous generation (watts, not "improved efficiency")
- Thermal: TDP and operating range does this need active cooling?
- Form factor: backward compatible or requires new line cards?
3. **Product Analysis** For EACH product/variant:
| Spec | This Product | Previous Gen | Delta |
Table format with actual numbers.
Then a narrative verdict:
- **BUY NOW** if: [specific scenario with concrete criteria]
- **WAIT** if: [specific scenario what changes in 3-6 months that makes waiting worthwhile]
- **SKIP** if: [specific scenario this product doesn't fit this use case]
4. **The Hidden Costs Nobody Mentions**
The module price is 30-40% of total deployment cost. Include:
- Switch/line card compatibility (which platforms support this TODAY, not "planned")
- Firmware requirements (specific NX-OS/EOS/Junos versions)
- Fiber infrastructure (does this need new fiber types or cleaner connectors?)
- Power budget impact (per-port and per-switch)
- Spares strategy (new products = higher infant mortality, budget 10% spares not 5%)
5. **Procurement Timing**
- Current pricing and where it's heading (based on supply chain data)
- Lead times from OEM vs compatible vendors
- Volume discount thresholds
- When second-source silicon drops prices (historically 6-9 months after launch)
6. **Bottom Line** (3-5 decisive statements)
Not "consider your needs." Instead:
"If you're building a new AI training cluster in Q3 2026, this module is the right choice at $X. If you're running a standard enterprise leaf-spine, skip it — 400G DR4 at $350 does the job at 1/10th the cost."
ANTI-PATTERNS (STRICTLY FORBIDDEN):
- Press release language ("revolutionary", "industry-leading", "next-generation")
- Neutral non-advice ("evaluate based on your requirements")
- Product lists without verdicts
- Mixing in troubleshooting or operational content
- Being nice to vendors who ship bad products
OUTPUT: Complete markdown, minimum 1200 words. No placeholders.`;
// Keep the old MASTER_PROMPT name as alias for backward compatibility
export const MASTER_PROMPT = TUTORIAL_PROMPT; export const MASTER_PROMPT = TUTORIAL_PROMPT;
// ═══════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════
// REFINEMENT PASSES // REFINEMENT PASSES — Post-Generation Pipeline
// ═══════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════
export const DEPTH_PROMPT = `Take the existing article and improve it with technical depth. /**
* NARRATIVE CONTROL — Enforce continuous flow, kill visible structure
ADD where missing: * Runs FIRST after master generation.
1. Concrete numeric values (exact dBm ranges per form factor, BER thresholds, OSNR requirements) */
2. Power budget calculations (if the article discusses reach or link issues) export const NARRATIVE_CONTROL_PROMPT = `Rewrite this article to read as one continuous narrative.
3. CLI command examples with realistic output snippets
4. Cause-effect explanations (WHY does this happen, not just WHAT to do)
5. Real-world context (what does this look like in a running network)
6. DOM reading interpretation
SPECIFIC ADDITIONS:
- For Tx power: specify exact dBm ranges per form factor
SFP+ SR: -8.2 to +0.5 dBm, alarm at -11.0 dBm
QSFP28 LR4: -4.3 to +4.5 dBm, alarm at -7.0 dBm
QSFP-DD DR4: -2.9 to +3.0 dBm per lane
400ZR: -10.0 to +2.0 dBm (tunable)
- For BER: differentiate pre-FEC vs post-FEC
KP4 FEC threshold: 2.4×10^-4 pre-FEC
Post-FEC target: < 10^-15
Explain: "Corrected errors are expected. Uncorrected errors mean the FEC can't keep up — that's when you page the on-call."
- For coherent: OSNR requirements per speed
100G DP-QPSK: 12 dB minimum
400G 16QAM: 20 dB minimum
800G: 24 dB minimum
- For temperature: why top-of-rack runs hotter, impact on laser lifetime
REMOVE: REMOVE:
- Vague statements ("may indicate issues", "consider checking") - All H2/H3 headings within the body (keep only the title)
- Generic filler that adds no technical value - All numbered lists that read like procedures
- Redundant explanations already covered elsewhere in the article - All bullet lists that should be prose
- All visible section labels ("What breaks", "The real cost", "Key takeaways")
- All repeated structural patterns (don't use the same format for each point)
Do NOT make the text longer unless it adds real technical value. RESTRUCTURE:
Preserve the markdown structure. - Convert lists into flowing paragraphs
Keep the engineer voice direct, confident, slightly opinionated.`; - Use horizontal rules (---) as thought breaks between major shifts
- Vary paragraph length mix 1-sentence paragraphs with 3-4 sentence ones
- Make transitions invisible the next thought should follow naturally
export const ANTI_GENERIC_INTRO_PROMPT = `Rewrite the introduction of this article. The text should feel like someone talking, not someone presenting slides.
KILL any generic or marketing-style opening. Engineers close the tab immediately if they see: Return the complete rewritten article. Preserve the core content and insights.`;
- "In today's rapidly evolving network landscape"
- "Optical transceivers play a key role"
- "As data center bandwidth demands increase"
- Any sentence that could apply to any article about any topic
REPLACE WITH a real scenario that the reader immediately recognizes from their own experience. /**
Make the reader feel "this person has been in my shoes." * AUTO-KILL LAYER Remove all patterns that make text feel generated
Include specific technical details in the opening (model names, dBm values, error counts). * This is the most critical pass. It catches everything the master prompt missed.
*/
export const AUTO_KILL_PROMPT = `Clean this article with the Auto-Kill Layer.
The intro should be 3-5 sentences maximum. Get to the point. Delete or rewrite anything that feels like:
- Data sheet residue (raw spec values, dBm ranges, TX/RX numbers)
- Formula residue (calculations, equations, budget math)
- Section leakage (visible module labels, "What breaks in production")
- Generic AI transitions ("For example", "This means that", "This highlights")
- Repeated concepts (same idea explained twice in different sections)
- SKU mentions (vendor part codes like FX-400DR4-001)
- Exaggerated authority ("This is something we see regularly", "Let me tell you")
- Over-explained basics (defining terms the audience already knows)
- Whitepaper language ("It is essential to", "A structured strategy", "best practices")
- Fake precision (invented firmware versions, unverifiable exact costs)
- Dramatic framing ("ticking time bomb", "recipe for disaster", "the numbers don't lie")
Example of a great opening: HARD DELETE Remove these phrases entirely if found:
"It's 2 AM. NOC pager goes off. Core spine link between pods is flapping — 200G aggregate capacity lost. You SSH into the switch, check the optics, and see Tx power at -14.3 dBm on a module rated for -8.2 to +0.5. The transceiver is dying. Here's how you diagnose this in under 5 minutes." "Let me tell you something", "In conclusion", "Let's break this down",
"Here's what you need to know", "The key takeaway", "This highlights",
"It is important to note", "In a real-world scenario", "recipe for disaster",
"ticking time bomb", "the numbers don't lie", "robust validation strategy",
"proper cleaning protocols are crucial", "significant benefits", "cutting-edge",
"future-proof solution", "increasingly important", "plays a key role"
Return the complete article with the fixed introduction. Do not change the rest.`; Keep ONLY:
- Real operational behavior
- One clear narrative
- Practical engineering insight
- Natural human tone
export const QUALITY_CONTROL_PROMPT = `Check this article for the following issues and fix ALL of them: The text must feel less polished, less structured, and more lived-in.
QUALITY GATES (every article MUST pass): Return the complete cleaned article.`;
1. NUMERIC VALUES Every technical claim MUST have a number attached. /**
BAD: "Low power indicates a problem" * REDUCTION ENGINE Cut 40% of the text
GOOD: "Tx below -11.0 dBm on a 10G SR module means the laser is degrading" * Brevity is the goal. Every sentence must earn its place.
*/
export const REDUCTION_PROMPT = `Cut this article by 40%.
2. GENERIC PHRASES Kill all of these: REMOVE:
"plays a key role", "increasingly important", "it is important to note", - Repetition (keep only the strongest version of each idea)
"in today's rapidly evolving", "optimize", "leverage", "enhance", - Secondary explanations that add nothing new
"consider implementing", "may indicate", "could potentially" - "Nice to have" details that don't serve the core argument
Replace with direct, specific statements. - Sentences that exist only because they sound complete
- Any paragraph that could be removed without losing the thread
3. PLACEHOLDER TEXT Zero tolerance for TODO, NOTE, FIXME, <!-- -->, or incomplete sections. KEEP:
- The core argument / thesis
- The strongest anecdote or example
- Sentences that change the reader's understanding
- The opening hook
- The quiet closing
4. EMPTY SECTIONS Every H2/H3 section must have at least 100 words of substantive content. After cutting, read it back. If any sentence feels like filler, cut it too.
5. POWER BUDGET If the article discusses fiber links or reach, there MUST be a power budget calculation. The best version of this article is the shortest one that still lands.
6. CLI EXAMPLES At least 2 real CLI commands in the article. Return the complete reduced article.`;
7. CAUSE-EFFECT Every "do X" must explain WHY. No unexplained instructions. /**
* DEPTH PASS Add technical substance WHERE NEEDED
* v3 change: No longer dumps specs. Only adds depth where the text is vague.
*/
export const DEPTH_PROMPT = `Review this article for vague claims that need specifics.
8. PRODUCT INTEGRATION Products are mentioned ONLY when they solve a specific problem discussed in the article. No random product dumps. ONLY add detail where the text makes a claim without backing it up.
9. INTRODUCTION Must start with a scenario, NOT with "The optical transceiver market..." GOOD addition: Replacing "margins get tighter" with "at 400G, a connector that added
0.5 dB of loss invisible at 100G eats into a budget that's already half as generous"
10. MINIMUM DEPTH Article must be at least 1200 words. If under that, add depth to existing sections (don't add filler). BAD addition: Inserting a TX power range table or a power budget calculation
For each issue found, rewrite the affected section to fix it.
Return the complete fixed article in markdown.`;
/** Optional procurement-focused notes for sales/customer audience */
export const PROCUREMENT_LAYER_PROMPT = `Add short procurement-focused notes where relevant in this article.
Rules: Rules:
- Maximum 1-2 sentences per note, woven naturally into the text - Add specifics that support the narrative, not spec blocks
- Focus on cost of misdiagnosis and unnecessary replacements - If a claim is already clear without numbers, leave it alone
- Mention price context only when it helps the reader make better decisions - Never add CLI examples unless the article is explicitly a troubleshooting guide
- Keep the engineer voice you're helping them save money, not selling - Never add comparison tables
- Keep the human voice additions must sound natural, not inserted
Good example: Return the complete article with additions woven in naturally.`;
"Before RMA'ing a $2,400 QSFP-DD module, clean the fiber end-face. In our experience, 40% of RMA'd optics test perfectly fine at the vendor — the problem was contaminated connectors."
Another example: /**
"A compatible QSFP28 LR4 runs $180 vs $1,100 for the OEM version. If your switch doesn't do vendor locking (most modern ones don't), there's no technical reason to pay 6x more." * ANTI-GENERIC INTRO Fix weak openings
* Kept from v2 but simplified.
*/
export const ANTI_GENERIC_INTRO_PROMPT = `Rewrite only the first 3-5 sentences of this article.
Do NOT turn this into marketing content. Keep the engineer voice. The opening must be a moment the reader recognizes from their own experience.
Return the complete article with the notes added.`; Not a market overview. Not a definition. Not a rhetorical question.
Something specific happened. Start there.
Return the complete article with only the introduction changed.`;
/**
* QUALITY CONTROL Final validation
* Simplified from v2. Checks for Auto-Kill failures.
*/
export const QUALITY_CONTROL_PROMPT = `Final quality check. Fix any remaining issues:
1. Any phrases from the Hard Delete List still present? Remove them.
2. Any spec blocks (dBm values, TX/RX tables) still present? Remove them.
3. Any visible section headings within the body? Remove them (keep title only).
4. Any repeated ideas? Keep only the stronger version.
5. Any numbered procedure lists? Convert to narrative.
6. Any whitepaper language? Rewrite in plain engineering voice.
7. Does the article have ONE clear purpose? If it drifts, cut the drift.
8. Is the ending quiet and confident? No "In conclusion", no call to action.
9. Word count check: if over 1200 words, cut more. Shorter is better.
Return the complete fixed article.`;
/** Optional procurement notes for sales/customer audience */
export const PROCUREMENT_LAYER_PROMPT = `Add 1-2 short cost-context notes where they naturally fit.
Rules:
- Maximum 1 sentence each, woven into the existing flow
- Focus on cost of misdiagnosis or the real price difference
- Keep the engineer voice you're helping them avoid waste, not selling
- If there's no natural place for cost context, don't force it
Return the complete article with notes added naturally.`;
/**
* LINKEDIN POST Generate matching LinkedIn post
* New in v3. Every blog gets a LinkedIn companion.
*/
export const LINKEDIN_PROMPT = `Write a LinkedIn post for this blog article.
Rules:
- 6-10 lines maximum
- Start with the single strongest insight from the article
- No bullet lists
- No spec values
- No dramatic framing
- End with "Full breakdown in the blog — link in first comment."
- Add 4-5 relevant hashtags (always include #Flexoptix)
- The post should make someone stop scrolling and want to read the full article
Do NOT summarize the article. Pick the one thing that would surprise someone
and lead with that.`;
// ═══════════════════════════════════════════════════════
// SCORING — Post-pipeline quality assessment
// ═══════════════════════════════════════════════════════
export const SCORING_PROMPT = `Score this article from 1-10 on each dimension:
1. CLEANLINESS No spec residue, no formula residue, no AI phrases
2. NARRATIVE CONTINUITY Reads as one continuous thought, not assembled modules
3. NON-AI FEEL Would a reader think a person wrote this, not an LLM?
4. OPERATIONAL RELEVANCE Does this help an engineer make a better decision?
For each score below 8, list what should still be removed or rewritten.
Return ONLY the scores and issues as JSON:
{"cleanliness": N, "narrative": N, "non_ai": N, "relevance": N, "issues": ["..."]}`;
// ═══════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════
// VIRAL & SIGNAL PASS — Flexoptix Social Masterfile v1.0 // VIRAL & SIGNAL PASS — Flexoptix Social Masterfile v1.0
@ -591,38 +581,34 @@ export function buildTopicPrompt(
parts.push(NEW_PRODUCT_PROMPT); parts.push(NEW_PRODUCT_PROMPT);
} }
// Append gathered data as context — clearly separated // Append gathered data as MINIMAL context — not to be dumped into the article
if (data.products.length > 0) { if (data.products.length > 0) {
parts.push("\n\n--- PRODUCT DATA (use as reference, integrate contextually — do NOT list randomly) ---"); parts.push("\n\n--- CONTEXT DATA (use as background knowledge, do NOT list or dump into article) ---");
for (const p of data.products.slice(0, 15)) { for (const p of data.products.slice(0, 10)) {
const price = p.price ? `, ~€${p.price}` : ""; parts.push(`${p.standard_name || p.slug}: ${p.form_factor} ${p.speed}, ${p.reach_label || ""}, ${p.vendor || ""}`);
parts.push(`${p.standard_name || p.slug}: ${p.form_factor} ${p.speed}, reach ${p.reach_label || "N/A"}, fiber ${p.fiber_type || "N/A"}, vendor ${p.vendor || "N/A"}${price}`);
} }
} }
if (data.news.length > 0) { if (data.news.length > 0) {
parts.push("\n\n--- RECENT INDUSTRY NEWS (reference only if genuinely relevant to the topic) ---"); parts.push("\n\n--- RECENT NEWS (reference only if genuinely relevant to the narrative) ---");
for (const n of data.news.slice(0, 5)) { for (const n of data.news.slice(0, 3)) {
parts.push(`${n.title} (${n.source || "unknown"}, ${n.date || "recent"})`); parts.push(`${n.title} (${n.source || "unknown"})`);
} }
} }
// Only include troubleshooting data for tutorial/troubleshooting articles // Troubleshooting data only for tutorial articles
// Strategy articles (hype_cycle, comparison, new_product) must NOT mix in troubleshooting
if (topic === "tutorial" && data.troubleshooting.length > 0) { if (topic === "tutorial" && data.troubleshooting.length > 0) {
parts.push("\n\n--- TROUBLESHOOTING DATA (incorporate into relevant sections with full context) ---"); parts.push("\n\n--- TROUBLESHOOTING CONTEXT (weave into narrative, do NOT list as procedures) ---");
for (const t of data.troubleshooting) { for (const t of data.troubleshooting.slice(0, 3)) {
parts.push(`• Symptom: ${t.symptom}`); parts.push(`${t.symptom}${t.cause}${t.solution}`);
parts.push(` Cause: ${t.cause}`);
parts.push(` Fix: ${t.solution}`);
} }
} }
// FAQ data only for tutorials and comparisons // FAQ only for tutorials
if ((topic === "tutorial" || topic === "comparison") && data.faq.length > 0) { if (topic === "tutorial" && data.faq.length > 0) {
parts.push("\n\n--- FAQ DATA (address these questions naturally in the article flow) ---"); parts.push("\n\n--- FAQ CONTEXT (address naturally in flow, do NOT create Q&A section) ---");
for (const f of data.faq.slice(0, 5)) { for (const f of data.faq.slice(0, 3)) {
parts.push(`Q: ${f.question} → A: ${f.answer}`); parts.push(`${f.question}`);
} }
} }

File diff suppressed because it is too large Load Diff

View File

@ -16,22 +16,21 @@ import { pool } from "../db/client";
const pipelineProgress = new Map<string, { step: number; total: number; label: string; pct: number }>(); const pipelineProgress = new Map<string, { step: number; total: number; label: string; pct: number }>();
function setProgress(draftId: string, step: number, label: string): void { function setProgress(draftId: string, step: number, label: string): void {
const pct = Math.round((step / 18) * 92) + 2; // 2%..94% during run, 100% on complete const pct = Math.round((step / 16) * 92) + 2; // 2%..94% during run, 100% on complete
pipelineProgress.set(draftId, { step, total: 17, label, pct }); pipelineProgress.set(draftId, { step, total: 16, label, pct });
} }
function clearProgress(draftId: string): void { function clearProgress(draftId: string): void {
pipelineProgress.delete(draftId); pipelineProgress.delete(draftId);
} }
import { semanticSearch } from "../embeddings/client"; import { semanticSearch } from "../embeddings/client";
import { generate, checkHealth, resetOllamaQueue, resetClaudeQueue, getQueueDepth } from "../llm/client"; import { generate, checkHealth, resetOllamaQueue, getQueueDepth } from "../llm/client";
import { import {
SYSTEM_PROMPT, SYSTEM_PROMPT,
DEPTH_PROMPT, DEPTH_PROMPT,
ANTI_GENERIC_INTRO_PROMPT, ANTI_GENERIC_INTRO_PROMPT,
QUALITY_CONTROL_PROMPT, QUALITY_CONTROL_PROMPT,
PROCUREMENT_LAYER_PROMPT, PROCUREMENT_LAYER_PROMPT,
VIRAL_SIGNAL_PROMPT,
buildTopicPrompt, buildTopicPrompt,
} from "../llm/blog-prompts"; } from "../llm/blog-prompts";
@ -334,12 +333,11 @@ function validateArticle(content: string): string[] {
} }
// Check minimum depth // Check minimum depth
const wordCount = content.split(/\s+/).length; const wordCount = content.split(/\s+/).length;
if (wordCount < 1200) { if (wordCount < 800) {
issues.push(`Too short: ${wordCount} words (minimum 1200)`); issues.push(`Too short: ${wordCount} words (minimum 800 for template, 1200 for LLM)`);
} }
// Check for power budget only in articles primarily about troubleshooting (title contains it) // Check for power budget section in troubleshooting articles
const titleLine = content.split("\n")[0]?.toLowerCase() || ""; if (content.toLowerCase().includes("troubleshoot") && !content.toLowerCase().includes("power budget")) {
if (titleLine.includes("troubleshoot") && !content.toLowerCase().includes("power budget")) {
issues.push("Missing power budget section"); issues.push("Missing power budget section");
} }
@ -1003,28 +1001,22 @@ async function runLlmPipeline(
STEP5_REALITY_INJECTION, STEP5_REALITY_INJECTION,
STEP6_TECHNICAL_DEEPENING, STEP6_TECHNICAL_DEEPENING,
STEP7_OPINION_LAYER, STEP7_OPINION_LAYER,
STEP_AFE,
STEP8_KILL_AI_TONE, STEP8_KILL_AI_TONE,
STEP8b_REDUCTION, STEP8b_REDUCTION,
STEP_AEM,
STEP8c_STYLE_LOCK, STEP8c_STYLE_LOCK,
STEP8d_AUTO_KILL,
AUTO_KILL_SCORING,
STEP9_QA_CHECK, STEP9_QA_CHECK,
STEP10_QUALITY_SCORE, STEP10_QUALITY_SCORE,
STEP_APM,
STEP_LINKEDIN_POST, STEP_LINKEDIN_POST,
BLOG_TYPES, BLOG_TYPES,
buildFeedbackContext, buildFeedbackContext,
buildSLLContext,
withCalibration, withCalibration,
STEP0_TITLE_CONTRACT,
STEP_TECHNICAL_SANITY,
STEP_SELF_HEAL,
STEP_TITLE_CONTRACT_CHECK,
} = await import("../llm/fo-blog-pipeline"); } = await import("../llm/fo-blog-pipeline");
const LLM_OPTS = { temperature: 0.7, maxTokens: 8192, timeoutMs: 480000 }; const LLM_OPTS = { temperature: 0.7, maxTokens: 8192, timeoutMs: 480000 };
const LLM_REFINE = { temperature: 0.4, maxTokens: 6144, timeoutMs: 480000 }; const LLM_REFINE = { temperature: 0.4, maxTokens: 6144, timeoutMs: 480000 };
const TOTAL_STEPS = 21; // 17-step pipeline + title contract + technical sanity + self-heal + contract check const TOTAL_STEPS = 16; // 10 original + 4b Narrative Control + 8b Reduction + 8c Style Lock + 8d Auto-Kill + Auto-Kill Score + LinkedIn
let stepsCompleted = 0; let stepsCompleted = 0;
try { try {
@ -1044,14 +1036,7 @@ async function runLlmPipeline(
}))); })));
} catch { /* no feedback yet, that's fine */ } } catch { /* no feedback yet, that's fine */ }
// Load SLL learned patterns (safe-fails if no data yet) const systemPrompt = withCalibration(FO_BLOG_SYSTEM_PROMPT + feedbackContext);
let sllContext = "";
try {
sllContext = await buildSLLContext();
if (sllContext) console.log(" SLL: Learned patterns injected into system prompt");
} catch { /* no SLL data yet, fine */ }
const systemPrompt = withCalibration(FO_BLOG_SYSTEM_PROMPT + feedbackContext + sllContext);
// Warmup // Warmup
await generate("Test", "OK", { temperature: 0.1, maxTokens: 8, timeoutMs: 60000 }).catch(() => {}); await generate("Test", "OK", { temperature: 0.1, maxTokens: 8, timeoutMs: 60000 }).catch(() => {});
@ -1102,45 +1087,11 @@ async function runLlmPipeline(
// Get blog type config // Get blog type config
const blogType = BLOG_TYPES[selectedTopic as keyof typeof BLOG_TYPES] || BLOG_TYPES.tutorial; const blogType = BLOG_TYPES[selectedTopic as keyof typeof BLOG_TYPES] || BLOG_TYPES.tutorial;
// Load existing articles to prevent angle repetition
let existingAnglesContext = "";
let forbiddenAnglesContext = "";
try {
const existingResult = await pool.query(
`SELECT title, draft_content FROM blog_drafts
WHERE status IN ('published', 'review', 'ready') AND draft_content IS NOT NULL
ORDER BY created_at DESC LIMIT 10`
);
if (existingResult.rows.length > 0) {
const summaries = existingResult.rows.map((r: { title: string; draft_content: string }) => {
// Extract first 150 chars of content as summary
const preview = (r.draft_content || "").replace(/^#[^\n]*\n/, "").trim().slice(0, 150);
return `- "${r.title}": ${preview}...`;
}).join("\n");
existingAnglesContext = `\n\nALREADY PUBLISHED ARTICLES (do NOT repeat their angles or structure):\n${summaries}\n\nFor this new article, choose a COMPLETELY DIFFERENT perspective and angle than any of the above.`;
forbiddenAnglesContext = `ALREADY WRITTEN ANGLES (forbidden — do not repeat these):\n${existingResult.rows.map((r: { title: string }) => `- "${r.title}"`).join("\n")}\n\nThe new article MUST have a structurally different angle — different story type, different reader takeaway, different perspective lens.\n`;
}
} catch { /* fine if no articles yet */ }
// ═══ STEP 0: Title Contract — bind LLM to headline promise ═══
console.log(" Step 0: Title Contract (binding headline to content)...");
setProgress(draftId, 1, "Step 0: Title Contract");
const step0 = await generate(systemPrompt,
STEP0_TITLE_CONTRACT.replace("{{TITLE}}", title),
{ ...LLM_REFINE, maxTokens: 2048 }
);
const titleContract = step0.text;
console.log(` Title Contract: ${titleContract.split("\n").slice(0, 3).join(" | ").slice(0, 120)}...`);
// ═══ STEP 1: Topic Expansion ═══ // ═══ STEP 1: Topic Expansion ═══
console.log(" Step 1: Topic Expansion..."); console.log(" Step 1/10: Topic Expansion...");
setProgress(draftId, 2, "Step 1: Topic Expansion"); setProgress(draftId, 1, "Step 1/10: Topic Expansion");
const step1 = await generate(systemPrompt, const step1 = await generate(systemPrompt,
STEP1_TOPIC_EXPANSION STEP1_TOPIC_EXPANSION.replace("{{TOPIC}}", title),
.replace("{{TOPIC}}", title)
.replace("{{EXISTING_ANGLES}}", existingAnglesContext + "\n\nTITLE CONTRACT (the article MUST fulfill this):\n" + titleContract),
LLM_OPTS LLM_OPTS
); );
stepsCompleted = 1; stepsCompleted = 1;
@ -1149,9 +1100,7 @@ async function runLlmPipeline(
console.log(" Step 2/10: Angle Selection..."); console.log(" Step 2/10: Angle Selection...");
setProgress(draftId, 2, "Step 2/10: Angle Selection"); setProgress(draftId, 2, "Step 2/10: Angle Selection");
const step2 = await generate(systemPrompt, const step2 = await generate(systemPrompt,
STEP2_ANGLE_SELECTION STEP2_ANGLE_SELECTION.replace("{{SCENARIOS}}", step1.text),
.replace("{{FORBIDDEN_ANGLES}}", forbiddenAnglesContext + "\nTITLE CONTRACT:\n" + titleContract)
.replace("{{SCENARIOS}}", step1.text),
LLM_REFINE LLM_REFINE
); );
stepsCompleted = 2; stepsCompleted = 2;
@ -1174,7 +1123,6 @@ async function runLlmPipeline(
const step4 = await generate(systemPrompt, const step4 = await generate(systemPrompt,
STEP4_MASTER_DRAFT STEP4_MASTER_DRAFT
.replace("{{OUTLINE}}", step3.text) .replace("{{OUTLINE}}", step3.text)
.replace("{{TITLE_CONTRACT_INJECT}}", "TITLE CONTRACT FOR THIS ARTICLE (BINDING — every paragraph must serve this promise):\n" + titleContract)
.replace("{{CONTEXT_DATA}}", contextData), .replace("{{CONTEXT_DATA}}", contextData),
{ ...LLM_OPTS, maxTokens: 8192 } { ...LLM_OPTS, maxTokens: 8192 }
); );
@ -1201,88 +1149,77 @@ async function runLlmPipeline(
stepsCompleted = 6; stepsCompleted = 6;
// ═══ STEP 6: Technical Deepening ═══ // ═══ STEP 6: Technical Deepening ═══
console.log(" Step 7/16: Technical Deepening..."); console.log(" Step 7/13: Technical Deepening...");
setProgress(draftId, 7, "Step 7/16: Technical Deepening"); setProgress(draftId, 7, "Step 7/13: Technical Deepening");
const step6 = await generate(systemPrompt, const step6 = await generate(systemPrompt,
STEP6_TECHNICAL_DEEPENING.replace("{{ARTICLE}}", step5.text), STEP6_TECHNICAL_DEEPENING.replace("{{ARTICLE}}", step5.text),
LLM_REFINE LLM_REFINE
); );
stepsCompleted = 7; stepsCompleted = 6;
// ═══ STEP 7: Opinion Layer ═══ // ═══ STEP 7: Opinion Layer ═══
console.log(" Step 8/16: Opinion Layer..."); console.log(" Step 8/13: Opinion Layer...");
setProgress(draftId, 8, "Step 8/16: Opinion Layer"); setProgress(draftId, 8, "Step 8/13: Opinion Layer");
const step7 = await generate(systemPrompt, const step7 = await generate(systemPrompt,
STEP7_OPINION_LAYER.replace("{{ARTICLE}}", step6.text), STEP7_OPINION_LAYER.replace("{{ARTICLE}}", step6.text),
LLM_REFINE LLM_REFINE
); );
stepsCompleted = 8; stepsCompleted = 8;
// ═══ STEP AFE: Auto-Focus Enforcer (ONE idea, ONE scenario, kill drift) ═══ // ═══ STEP 8: Kill AI Tone ═══
console.log(" Step 9/16: Auto-Focus Enforcer (kill multi-topic drift)..."); console.log(" Step 9/13: Kill AI Tone...");
setProgress(draftId, 9, "Step 9/16: Auto-Focus Enforcer"); setProgress(draftId, 9, "Step 9/13: Kill AI Tone");
const stepAFE = await generate(systemPrompt, const step8 = await generate(systemPrompt,
STEP_AFE.replace("{{ARTICLE}}", step7.text), STEP8_KILL_AI_TONE.replace("{{ARTICLE}}", step7.text),
LLM_REFINE LLM_REFINE
); );
stepsCompleted = 9; stepsCompleted = 9;
const wordsAFE = stepAFE.text.split(/\s+/).length;
const wordsBeforeAFE = step7.text.split(/\s+/).length;
const pctAFE = Math.round((1 - wordsAFE / wordsBeforeAFE) * 100);
if (pctAFE > 5) console.log(` AFE cut: ${wordsBeforeAFE}${wordsAFE} words (${pctAFE}%) — drift removed`);
// ═══ STEP 8: Kill AI Tone ═══ // ═══ STEP 8b: Reduction Engine (5-pass, target: cut 40%) ═══
console.log(" Step 10/16: Kill AI Tone..."); console.log(" Step 10/16: Reduction Engine (5-pass, cut 40%, target 600-1000 words)...");
setProgress(draftId, 10, "Step 10/16: Kill AI Tone"); setProgress(draftId, 10, "Step 10/16: Reduction Engine (cut 40%)");
const step8 = await generate(systemPrompt,
STEP8_KILL_AI_TONE.replace("{{ARTICLE}}", stepAFE.text),
LLM_REFINE
);
stepsCompleted = 10;
// ═══ STEP 8b: Reduction Engine (5-pass: Repetition Kill → Tech Prune → Flow Rebuild → Weight Correction → Humanization) ═══
console.log(" Step 11/16: Reduction Engine (5-pass, target 700-1000 words)...");
setProgress(draftId, 11, "Step 11/16: Reduction Engine");
const step8b = await generate(systemPrompt, const step8b = await generate(systemPrompt,
STEP8b_REDUCTION.replace("{{ARTICLE}}", step8.text), STEP8b_REDUCTION.replace("{{ARTICLE}}", step8.text),
LLM_REFINE LLM_REFINE
); );
stepsCompleted = 11; stepsCompleted = 10;
const wordsAfter = step8b.text.split(/\s+/).length; const wordsAfter = step8b.text.split(/\s+/).length;
const wordsBefore = step8.text.split(/\s+/).length; const wordsBefore = step8.text.split(/\s+/).length;
const pctChange = Math.round((1 - wordsAfter / wordsBefore) * 100); const pctChange = Math.round((1 - wordsAfter / wordsBefore) * 100);
console.log(` After reduction: ${wordsAfter} words (was ${wordsBefore}, ${pctChange}%) ${wordsAfter > 2000 ? "⚠ WARNING: >2000 words" : wordsAfter < 1000 ? "⚠ WARNING: <1000 words" : "✓ in target range"}`); console.log(` After reduction: ${wordsAfter} words (was ${wordsBefore}, ${pctChange}%) ${wordsAfter > 1200 ? "⚠ WARNING: >1200 words" : wordsAfter < 500 ? "⚠ WARNING: <500 words" : "✓ in target range"}`);
// ═══ STEP AEM: Auto-Editor Mode (Senior Engineer voice polish) ═══ // ═══ STEP 8c: Style Lock ═══
console.log(" Step 12/16: Auto-Editor Mode (senior engineer voice polish)..."); console.log(" Step 11/16: Style Lock (tone consistency + scope/SKU fixes)...");
setProgress(draftId, 12, "Step 12/16: Auto-Editor Mode"); setProgress(draftId, 11, "Step 11/16: Style Lock");
const stepAEM = await generate(systemPrompt, const step8c = await generate(systemPrompt,
STEP_AEM.replace("{{ARTICLE}}", step8b.text), STEP8c_STYLE_LOCK.replace("{{ARTICLE}}", step8b.text),
LLM_REFINE
);
stepsCompleted = 11;
// ═══ STEP 8d: Auto-Kill Layer v1.0 (10 categories A-J) ═══
console.log(" Step 12/16: Auto-Kill Layer (10 categories A-J)...");
setProgress(draftId, 12, "Step 12/16: Auto-Kill Layer");
const step8d = await generate(systemPrompt,
STEP8d_AUTO_KILL.replace("{{ARTICLE}}", step8c.text),
LLM_REFINE LLM_REFINE
); );
stepsCompleted = 12; stepsCompleted = 12;
const wordsAfterKill = step8d.text.split(/\s+/).length;
console.log(` After Auto-Kill: ${wordsAfterKill} words (was ${step8c.text.split(/\s+/).length})`);
// ═══ STEP 8c: Style Lock ═══ // ═══ STEP 9: QA Check ═══
console.log(" Step 13/16: Style Lock (tone consistency + scope/SKU fixes)..."); console.log(" Step 13/16: QA Check...");
setProgress(draftId, 13, "Step 13/16: Style Lock"); setProgress(draftId, 13, "Step 13/16: QA Check");
const step8c = await generate(systemPrompt, const step9 = await generate(systemPrompt,
STEP8c_STYLE_LOCK.replace("{{ARTICLE}}", stepAEM.text), STEP9_QA_CHECK.replace("{{ARTICLE}}", step8d.text),
LLM_REFINE LLM_REFINE
); );
stepsCompleted = 13; stepsCompleted = 13;
// ═══ STEP 9: QA Check ═══
console.log(" Step 14/16: QA Check...");
setProgress(draftId, 14, "Step 14/16: QA Check");
const step9 = await generate(systemPrompt,
STEP9_QA_CHECK.replace("{{ARTICLE}}", step8c.text),
LLM_REFINE
);
stepsCompleted = 14;
// ═══ STEP 10: Quality Score ═══ // ═══ STEP 10: Quality Score ═══
console.log(" Step 15/16: Quality Score..."); console.log(" Step 14/16: Quality Score...");
setProgress(draftId, 15, "Step 15/16: Quality Score"); setProgress(draftId, 14, "Step 14/16: Quality Score");
let autoQaScore: Record<string, unknown> | null = null; let autoQaScore: Record<string, unknown> | null = null;
try { try {
const step10 = await generate(systemPrompt, const step10 = await generate(systemPrompt,
@ -1298,152 +1235,55 @@ async function runLlmPipeline(
} catch { } catch {
console.log(" Quality scoring skipped (parse error)"); console.log(" Quality scoring skipped (parse error)");
} }
stepsCompleted = 14;
// ═══ Auto-Kill Scoring (non-destructive) ═══
console.log(" Step 15/16: Auto-Kill Scoring...");
setProgress(draftId, 15, "Step 15/16: Auto-Kill Scoring");
let autoKillScores: Record<string, unknown> | null = null;
try {
const killScoreResult = await generate(systemPrompt,
AUTO_KILL_SCORING.replace("{{ARTICLE}}", step9.text),
{ temperature: 0.2, maxTokens: 512, timeoutMs: 60000 }
);
const killJson = killScoreResult.text.match(/\{[\s\S]*\}/);
if (killJson) {
autoKillScores = JSON.parse(killJson[0]);
console.log(` Auto-Kill Scores: ${JSON.stringify(autoKillScores)}`);
}
} catch {
console.log(" Auto-Kill scoring skipped");
}
stepsCompleted = 15; stepsCompleted = 15;
// ═══ STEP APM: Auto-Precision Mode (Final Cut — last filter before publish) ═══ // ═══ LinkedIn Post ═══
console.log(" Step 16/18: Auto-Precision Mode (final cut — if a word can go, it must go)..."); console.log(" Step 16/16: LinkedIn Post (max 2,800 chars)...");
setProgress(draftId, 16, "Step 16/18: Auto-Precision Mode"); setProgress(draftId, 16, "Step 16/16: LinkedIn Post");
const stepAPM = await generate(systemPrompt, let linkedinPost: string | null = null;
STEP_APM.replace("{{ARTICLE}}", step9.text), let linkedinCharCount: number | null = null;
LLM_REFINE
);
stepsCompleted = 16;
const wordsAPM = stepAPM.text.split(/\s+/).length;
const wordsBeforeAPM = step9.text.split(/\s+/).length;
const pctAPM = Math.round((1 - wordsAPM / wordsBeforeAPM) * 100);
console.log(` APM: ${wordsBeforeAPM}${wordsAPM} words (${pctAPM}%) — precision cut done`);
// ═══ STEP 17: Viral Signal — FLEXOPTIX Social Masterfile transformation ═══
// Applies AVC (Auto-Viral-Check), ASS (Auto-Signal-Score), carry line enforcement,
// auto-kill phrase filter, and generates LinkedIn post in one pass.
console.log(" Step 17/18: Viral Signal (Social Masterfile transformation)...");
setProgress(draftId, 17, "Step 17/18: Viral Signal");
let viralArticle = stepAPM.text;
let viralLinkedinPost: string | null = null;
try { try {
const stepViral = await generate(systemPrompt, const stepLinkedIn = await generate(systemPrompt,
VIRAL_SIGNAL_PROMPT + "\n\nArticle:\n" + stepAPM.text, STEP_LINKEDIN_POST.replace("{{ARTICLE}}", step9.text),
{ temperature: 0.5, maxTokens: 8192, timeoutMs: 480000 } { temperature: 0.6, maxTokens: 1024, timeoutMs: 120000 }
); );
const viralOutput = stepViral.text.trim(); linkedinPost = stepLinkedIn.text.trim();
// Parse output: article + ---LINKEDIN--- + linkedin post
const linkedinSep = viralOutput.indexOf("---LINKEDIN---");
if (linkedinSep !== -1) {
viralArticle = viralOutput.slice(0, linkedinSep).trim();
viralLinkedinPost = viralOutput.slice(linkedinSep + "---LINKEDIN---".length).trim();
console.log(` Viral Signal: article ${viralArticle.split(/\s+/).length} words + LinkedIn ${viralLinkedinPost.length} chars`);
} else {
// No separator — treat entire output as article
viralArticle = viralOutput;
console.log(` Viral Signal: article ${viralArticle.split(/\s+/).length} words (no LinkedIn section)`);
}
// Validate viral output isn't too short (LLM may have over-cut)
if (viralArticle.split(/\s+/).length < 400) {
console.log(" ⚠ Viral Signal output too short — falling back to APM output");
viralArticle = stepAPM.text;
}
} catch {
console.log(" Viral Signal skipped (error) — using APM output");
}
stepsCompleted = 17;
// ═══ STEP 18: LinkedIn Post ═══
// Use Viral Signal LinkedIn if available, otherwise generate via STEP_LINKEDIN_POST
console.log(" Step 18/18: LinkedIn Post (max 2,800 chars)...");
setProgress(draftId, 18, "Step 18/18: LinkedIn Post");
let linkedinPost: string | null = viralLinkedinPost;
let linkedinCharCount: number | null = viralLinkedinPost ? viralLinkedinPost.length : null;
if (!linkedinPost) {
// Fallback: dedicated LinkedIn post generator
try {
const stepLinkedIn = await generate(systemPrompt,
STEP_LINKEDIN_POST
.replace("{{TITLE_CONTRACT}}", titleContract)
.replace("{{ARTICLE}}", viralArticle),
{ temperature: 0.6, maxTokens: 1024, timeoutMs: 120000 }
);
linkedinPost = stepLinkedIn.text.trim();
linkedinCharCount = linkedinPost.length;
} catch {
console.log(" LinkedIn post generation skipped");
}
}
// Enforce hard limit — truncate at last sentence before 2800 if too long
if (linkedinPost && linkedinPost.length > 2800) {
linkedinPost = linkedinPost.slice(0, 2800).replace(/[^.!?]*$/, "").trim();
linkedinCharCount = linkedinPost.length; linkedinCharCount = linkedinPost.length;
console.log(` LinkedIn post truncated to ${linkedinCharCount} chars`); // Enforce hard limit — truncate at last sentence before 2800 if too long
} else if (linkedinPost) { if (linkedinCharCount > 2800) {
console.log(` LinkedIn post: ${linkedinCharCount} chars`); linkedinPost = linkedinPost.slice(0, 2800).replace(/[^.!?]*$/, "").trim();
} linkedinCharCount = linkedinPost.length;
stepsCompleted = 18; console.log(` LinkedIn post truncated to ${linkedinCharCount} chars`);
// ═══ STEP 19: Technical Sanity Check ═══
console.log(" Step 19/21: Technical Sanity Check...");
setProgress(draftId, 19, "Step 19/21: Technical Sanity Check");
let sanityReport = "";
try {
const stepSanity = await generate(systemPrompt,
STEP_TECHNICAL_SANITY.replace("{{ARTICLE}}", viralArticle),
{ temperature: 0.2, maxTokens: 4096, timeoutMs: 240000 }
);
sanityReport = stepSanity.text.trim();
console.log(` Sanity check: ${sanityReport.includes('"safe_to_publish": false') ? "⚠ ISSUES FOUND" : "✓ safe"}`);
} catch {
console.log(" Technical sanity check skipped (error)");
}
stepsCompleted = 19;
// ═══ STEP 20: Self-Heal (fix technical errors) ═══
if (sanityReport && (sanityReport.includes('"safe_to_publish": false') || sanityReport.includes('"critical_issues"'))) {
console.log(" Step 20/21: Self-Heal (fixing technical errors)...");
setProgress(draftId, 20, "Step 20/21: Self-Heal (technical fixes)");
try {
const stepHeal = await generate(systemPrompt,
STEP_SELF_HEAL
.replace("{{SANITY_REPORT}}", sanityReport)
.replace("{{ARTICLE}}", viralArticle),
LLM_REFINE
);
const healedWords = stepHeal.text.split(/\s+/).length;
if (healedWords > 400) {
viralArticle = stepHeal.text;
console.log(` Self-healed: ${healedWords} words`);
} else {
console.log(" Self-heal output too short — keeping original");
}
} catch {
console.log(" Self-heal skipped (error)");
}
} else {
console.log(" Step 20/21: Self-Heal skipped (no critical issues)");
}
stepsCompleted = 20;
// ═══ STEP 21: Title Contract Verification ═══
console.log(" Step 21/21: Title Contract Verification...");
setProgress(draftId, 21, "Step 21/21: Title Contract Check");
try {
const stepContract = await generate(systemPrompt,
STEP_TITLE_CONTRACT_CHECK
.replace("{{TITLE_CONTRACT}}", titleContract)
.replace("{{ARTICLE}}", viralArticle),
{ temperature: 0.2, maxTokens: 2048, timeoutMs: 120000 }
);
const contractResult = stepContract.text.trim();
if (contractResult.includes('"contract_fulfilled": false') || contractResult.includes('"REJECT')) {
console.log(" ⚠ TITLE CONTRACT VIOLATION — article may not match headline");
} else { } else {
console.log(" ✓ Title contract fulfilled"); console.log(` LinkedIn post: ${linkedinCharCount} chars`);
} }
} catch { } catch {
console.log(" Title contract check skipped (error)"); console.log(" LinkedIn post generation skipped");
} }
stepsCompleted = 21; stepsCompleted = 16;
// Extract article from Viral Signal output (or APM fallback) // Extract only the article from STEP9 output (QA returns review + fixed article)
// Fall back to step9.text if output looks too short or empty // Look for "COMPLETE FIXED ARTICLE" marker and take everything after it
let finalArticleText = viralArticle.trim().length > 200 ? viralArticle : step9.text; let finalArticleText = step9.text;
const articleMarkers = [ const articleMarkers = [
"### COMPLETE FIXED ARTICLE", "### COMPLETE FIXED ARTICLE",
"## COMPLETE FIXED ARTICLE", "## COMPLETE FIXED ARTICLE",
@ -1451,16 +1291,13 @@ async function runLlmPipeline(
"---\n\n**You're", "---\n\n**You're",
"---\n\nYou're", "---\n\nYou're",
]; ];
// Also check step9 for QA markers (APM may have stripped them already)
for (const marker of articleMarkers) { for (const marker of articleMarkers) {
const idx = step9.text.indexOf(marker); const idx = step9.text.indexOf(marker);
if (idx !== -1) { if (idx !== -1) {
// Skip past the marker line itself
const afterMarker = step9.text.slice(idx + marker.length).trimStart(); const afterMarker = step9.text.slice(idx + marker.length).trimStart();
const extractedFromQA = afterMarker.replace(/^---\s*\n/, "").trimStart(); // Strip leading --- separator if present
// Only use QA extraction if it's meaningfully longer than APM output finalArticleText = afterMarker.replace(/^---\s*\n/, "").trimStart();
if (extractedFromQA.split(/\s+/).length > finalArticleText.split(/\s+/).length * 0.8) {
finalArticleText = extractedFromQA;
}
break; break;
} }
} }
@ -1475,29 +1312,18 @@ async function runLlmPipeline(
const wordCount = draftContent.split(/\s+/).length; const wordCount = draftContent.split(/\s+/).length;
const finalIssues = validateArticle(draftContent); const finalIssues = validateArticle(draftContent);
// Hard minimum word count gate (1200 for LLM pipeline) // Update the draft in DB
if (wordCount < 1200) {
const shortMsg = `⚠ WORD COUNT FAIL: ${wordCount} words — minimum 1200 for LLM pipeline`;
console.log(` ${shortMsg}`);
if (!finalIssues.includes(`Too short: ${wordCount} words`)) {
finalIssues.push(`Too short: ${wordCount} words (minimum 1200 for LLM pipeline — article needs expansion)`);
}
} else {
console.log(` ✓ Word count: ${wordCount} words (≥1200 — OK)`);
}
// Update the draft in DB — promote to 'ready' on full pipeline completion
await pool.query( await pool.query(
`UPDATE blog_drafts `UPDATE blog_drafts
SET draft_content = $1, word_count = $2, SET draft_content = $1, word_count = $2,
generated_by = 'fo-blog-engine-v7', generated_by = 'fo-blog-engine-v5-autokill',
pipeline_version = 'v7-viral-signal', pipeline_version = 'v5-auto-kill-layer',
pipeline_steps_completed = $3, pipeline_steps_completed = $3,
auto_qa_score = $4, auto_qa_score = $4,
outline = $5, outline = $5,
linkedin_post = $6, linkedin_post = $6,
linkedin_char_count = $7, linkedin_char_count = $7,
status = 'review', status = 'draft',
updated_at = NOW() updated_at = NOW()
WHERE id = $8::uuid`, WHERE id = $8::uuid`,
[ [
@ -1506,7 +1332,8 @@ async function runLlmPipeline(
stepsCompleted, stepsCompleted,
autoQaScore ? JSON.stringify(autoQaScore) : null, autoQaScore ? JSON.stringify(autoQaScore) : null,
JSON.stringify({ JSON.stringify({
generation_method: "fo-pipeline-v5", generation_method: "fo-pipeline-v5-autokill",
auto_kill_scores: autoKillScores,
steps_completed: stepsCompleted, steps_completed: stepsCompleted,
blog_type: selectedTopic, blog_type: selectedTopic,
quality_issues: finalIssues, quality_issues: finalIssues,
@ -1535,13 +1362,13 @@ async function runLlmPipeline(
} }
clearProgress(draftId); clearProgress(draftId);
console.log(`Blog FO Pipeline: ${draftId} complete — ${wordCount} words, ${stepsCompleted}/18 steps, QA: ${(autoQaScore as any)?.overall || "N/A"}/10, LinkedIn: ${linkedinCharCount ?? "n/a"} chars`); console.log(`Blog FO Pipeline: ${draftId} complete — ${wordCount} words, ${stepsCompleted}/14 steps, QA: ${(autoQaScore as any)?.overall || "N/A"}/10, LinkedIn: ${linkedinCharCount ?? "n/a"} chars`);
} catch (llmErr) { } catch (llmErr) {
clearProgress(draftId); clearProgress(draftId);
console.warn(`Blog FO Pipeline failed at step ${stepsCompleted + 1}/18 for ${draftId}: ${(llmErr as Error).message}`); console.warn(`Blog FO Pipeline failed at step ${stepsCompleted + 1}/14 for ${draftId}: ${(llmErr as Error).message}`);
// Update with partial progress // Update with partial progress
await pool.query( await pool.query(
`UPDATE blog_drafts SET pipeline_steps_completed = $1, pipeline_version = 'v7-viral-signal', `UPDATE blog_drafts SET pipeline_steps_completed = $1, pipeline_version = 'v5-narrative-control',
outline = $2, updated_at = NOW() WHERE id = $3::uuid`, outline = $2, updated_at = NOW() WHERE id = $3::uuid`,
[stepsCompleted, JSON.stringify({ error: (llmErr as Error).message, steps_completed: stepsCompleted }), draftId] [stepsCompleted, JSON.stringify({ error: (llmErr as Error).message, steps_completed: stepsCompleted }), draftId]
).catch(() => {}); ).catch(() => {});
@ -1550,8 +1377,7 @@ async function runLlmPipeline(
// POST /api/blog/generate — Generate a new blog draft (returns immediately, LLM runs async) // POST /api/blog/generate — Generate a new blog draft (returns immediately, LLM runs async)
blogRouter.post("/generate", async (req: Request, res: Response) => { blogRouter.post("/generate", async (req: Request, res: Response) => {
const { title: reqTitle, topic, speed, form_factor, use_case, use_llm } = req.body as { const { topic, speed, form_factor, use_case, use_llm } = req.body as {
title?: string;
topic?: string; topic?: string;
speed?: string; speed?: string;
form_factor?: string; form_factor?: string;
@ -1574,14 +1400,11 @@ blogRouter.post("/generate", async (req: Request, res: Response) => {
const year = new Date().getFullYear(); const year = new Date().getFullYear();
const template = templates[Math.floor(Math.random() * templates.length)]; const template = templates[Math.floor(Math.random() * templates.length)];
// Use caller-provided title if given; fall back to template title const title = template.title
const title = (reqTitle && reqTitle.trim()) .replace("{YEAR}", String(year))
? reqTitle.trim() .replace("{SPEED}", speed || "400G/800G")
: template.title .replace("{FORM_FACTOR}", form_factor || "QSFP-DD/OSFP")
.replace("{YEAR}", String(year)) .replace("{USE_CASE}", use_case || "Data Center Interconnect");
.replace("{SPEED}", speed || "400G/800G")
.replace("{FORM_FACTOR}", form_factor || "QSFP-DD/OSFP")
.replace("{USE_CASE}", use_case || "Data Center Interconnect");
const keywords = [ const keywords = [
...template.seo_keywords, ...template.seo_keywords,
@ -1592,19 +1415,6 @@ blogRouter.post("/generate", async (req: Request, res: Response) => {
const data = await gatherBlogData(keywords, selectedTopic); const data = await gatherBlogData(keywords, selectedTopic);
// Clean up stale template drafts for the same title (idempotent regeneration)
// If a template draft already exists for this title, remove it before creating a fresh one
await pool.query(
`DELETE FROM blog_feedback WHERE blog_id IN (
SELECT id FROM blog_drafts WHERE title = $1 AND generated_by = 'tip-blog-engine-template'
)`,
[title]
).catch(() => {});
await pool.query(
`DELETE FROM blog_drafts WHERE title = $1 AND generated_by = 'tip-blog-engine-template'`,
[title]
).catch(() => {});
// Always create a template draft first (instant response) // Always create a template draft first (instant response)
const draftContent = generateTemplateDraft(title, selectedTopic, data); const draftContent = generateTemplateDraft(title, selectedTopic, data);
const wordCount = draftContent.split(/\s+/).length; const wordCount = draftContent.split(/\s+/).length;
@ -1682,7 +1492,7 @@ blogRouter.post("/generate", async (req: Request, res: Response) => {
blogRouter.get("/", async (_req: Request, res: Response) => { blogRouter.get("/", async (_req: Request, res: Response) => {
try { try {
const result = await pool.query( const result = await pool.query(
`SELECT id, title, topic, target_audience, status, word_count, seo_keywords, generated_by, created_at, linkedin_post `SELECT id, title, topic, target_audience, status, word_count, seo_keywords, generated_by, created_at
FROM blog_drafts FROM blog_drafts
ORDER BY created_at DESC ORDER BY created_at DESC
LIMIT 50`, LIMIT 50`,
@ -1700,11 +1510,10 @@ blogRouter.get("/llm/status", async (_req: Request, res: Response) => {
res.json({ success: true, queue_depth: getQueueDepth(), llm: health }); res.json({ success: true, queue_depth: getQueueDepth(), llm: health });
}); });
// POST /api/blog/llm/reset-queue — Force-reset stuck Ollama or Claude queue // POST /api/blog/llm/reset-queue — Force-reset stuck Ollama queue
blogRouter.post("/llm/reset-queue", (_req: Request, res: Response) => { blogRouter.post("/llm/reset-queue", (_req: Request, res: Response) => {
resetOllamaQueue(); resetOllamaQueue();
resetClaudeQueue(); res.json({ success: true, message: "Ollama queue reset — stuck requests cleared" });
res.json({ success: true, message: "LLM queues reset — stuck requests cleared (Ollama + Claude)" });
}); });
// GET /api/blog/:id — Get a specific draft with full content // GET /api/blog/:id — Get a specific draft with full content
@ -1712,7 +1521,7 @@ blogRouter.post("/llm/reset-queue", (_req: Request, res: Response) => {
blogRouter.get("/:id/progress", (req: Request, res: Response) => { blogRouter.get("/:id/progress", (req: Request, res: Response) => {
const p = pipelineProgress.get(String(req.params.id)); const p = pipelineProgress.get(String(req.params.id));
if (!p) { if (!p) {
res.json({ success: true, running: false, step: 0, total: 18, label: "Idle", pct: 0 }); res.json({ success: true, running: false, step: 0, total: 10, label: "Idle", pct: 0 });
return; return;
} }
res.json({ success: true, running: true, ...p }); res.json({ success: true, running: true, ...p });
@ -1880,94 +1689,6 @@ blogRouter.post("/:id/regenerate", async (req: Request, res: Response) => {
}); });
// DELETE /api/blog/:id — Delete a blog draft // DELETE /api/blog/:id — Delete a blog draft
// POST /api/blog/:id/publish-ghost — Publish a draft to the Ghost blog via the Ghost Admin API.
//
// Flow: load the draft, refuse empty/near-empty content, build a short-lived
// HS256 JWT from the Admin API key, POST the markdown body as a mobiledoc
// markdown card, then mark the draft as 'published'.
//
// Responses: 404 draft not found · 400 content too short · 500 missing/invalid
// key, Ghost API failure or unexpected error · 200 { success, url, ghost_slug }.
//
// SECURITY: the Admin API key must come from the GHOST_ADMIN_KEY environment
// variable. A key used to be embedded here as a fallback; any key that was ever
// committed to source control should be treated as leaked and rotated in Ghost.
blogRouter.post("/:id/publish-ghost", async (req: Request, res: Response) => {
  try {
    const draft = await pool.query(
      "SELECT id, title, draft_content, seo_keywords FROM blog_drafts WHERE id = $1::uuid",
      [req.params.id]
    );
    if (draft.rows.length === 0) {
      return res.status(404).json({ success: false, error: "Draft not found" });
    }

    const { title, draft_content, seo_keywords } = draft.rows[0];
    if (!draft_content || draft_content.trim().length < 100) {
      return res.status(400).json({ success: false, error: "Draft content too short to publish" });
    }

    // Ghost Admin API JWT auth — key format is "<id>:<hex secret>".
    const GHOST_URL = process.env.GHOST_URL || "https://blog.fichtmueller.org";
    const GHOST_ADMIN_KEY = process.env.GHOST_ADMIN_KEY;
    if (!GHOST_ADMIN_KEY) {
      console.error("[blog] Ghost publish aborted: GHOST_ADMIN_KEY is not set");
      return res.status(500).json({ success: false, error: "Ghost Admin API key is not configured" });
    }
    const [keyId, secret] = GHOST_ADMIN_KEY.split(":");
    if (!keyId || !secret) {
      console.error("[blog] Ghost publish aborted: GHOST_ADMIN_KEY is not in id:secret form");
      return res.status(500).json({ success: false, error: "Ghost Admin API key is malformed" });
    }

    // Create JWT token for Ghost Admin API (valid for 5 minutes, audience /admin/).
    const crypto = await import("crypto");
    const header = Buffer.from(JSON.stringify({ alg: "HS256", typ: "JWT", kid: keyId })).toString("base64url");
    const now = Math.floor(Date.now() / 1000);
    const payload = Buffer.from(JSON.stringify({
      iat: now, exp: now + 300, aud: "/admin/"
    })).toString("base64url");
    // The secret half of the key is hex-encoded and must be decoded before signing.
    const signature = crypto.createHmac("sha256", Buffer.from(secret, "hex"))
      .update(`${header}.${payload}`).digest("base64url");
    const jwt = `${header}.${payload}.${signature}`;

    // Strip the leading "# Title" line from the content (Ghost has its own title field).
    const bodyContent = draft_content.replace(/^#\s+[^\n]+\n*/m, "").trim();

    // Wrap the markdown in a single mobiledoc markdown card.
    const mobiledoc = JSON.stringify({
      version: "0.3.1",
      ghostVersion: "4.0",
      markups: [], atoms: [], sections: [[10, 0]], cards: [["markdown", { markdown: bodyContent }]]
    });

    // Build up to five tags from seo_keywords. The column may come back either as
    // a comma-separated string or as an array, so accept both instead of throwing.
    const keywordList: string[] = Array.isArray(seo_keywords)
      ? seo_keywords.map((k: unknown) => String(k))
      : String(seo_keywords ?? "").split(",");
    const tags = keywordList
      .map((k: string) => k.trim())
      .filter(Boolean)
      .slice(0, 5)
      .map((t: string) => ({ name: t }));

    // POST to Ghost Admin API
    const ghostRes = await fetch(`${GHOST_URL}/ghost/api/admin/posts/?source=html`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "Accept-Version": "v5.0",
        Authorization: `Ghost ${jwt}`,
      },
      body: JSON.stringify({
        posts: [{
          title,
          mobiledoc,
          status: "published",
          tags: tags.length > 0 ? tags : [{ name: "Optical Networking" }],
        }]
      }),
    });

    if (!ghostRes.ok) {
      const errBody = await ghostRes.text();
      // Log only a prefix of the upstream body; the client gets just the status code.
      console.error("[blog] Ghost publish failed:", ghostRes.status, errBody.slice(0, 300));
      return res.status(500).json({ success: false, error: `Ghost API error: ${ghostRes.status}` });
    }

    const ghostData = await ghostRes.json() as { posts?: Array<{ url?: string; slug?: string }> };
    const ghostUrl = ghostData.posts?.[0]?.url || `${GHOST_URL}/`;

    // Update TIP draft status
    await pool.query(
      "UPDATE blog_drafts SET status = 'published', updated_at = NOW() WHERE id = $1::uuid",
      [req.params.id]
    );

    console.log(`[blog] Published to Ghost: ${title}${ghostUrl}`);
    res.json({ success: true, url: ghostUrl, ghost_slug: ghostData.posts?.[0]?.slug });
  } catch (err) {
    console.error("[blog] Ghost publish error:", err);
    res.status(500).json({ success: false, error: (err as Error).message });
  }
});
blogRouter.delete("/:id", async (req: Request, res: Response) => { blogRouter.delete("/:id", async (req: Request, res: Response) => {
try { try {
// Delete feedback first (FK constraint) // Delete feedback first (FK constraint)

View File

@ -27,6 +27,9 @@
* tsx src/index.ts --switch-crawl-pw Crawl switch assets (Playwright, JS-heavy vendors) * tsx src/index.ts --switch-crawl-pw Crawl switch assets (Playwright, JS-heavy vendors)
* tsx src/index.ts --fetch-only Run only fetch-based scrapers (no Playwright) * tsx src/index.ts --fetch-only Run only fetch-based scrapers (no Playwright)
* tsx src/index.ts --atgbics Run ATGBICS scraper once * tsx src/index.ts --atgbics Run ATGBICS scraper once
* tsx src/index.ts --naddod Run NADDOD scraper once
* tsx src/index.ts --qsfptek Run QSFPTEK scraper once
* tsx src/index.ts --addon Run AddOn Networks scraper once
*/ */
import { createScheduler, registerSchedules, registerWorkers } from "./scheduler"; import { createScheduler, registerSchedules, registerWorkers } from "./scheduler";
import { scrapeFs } from "./scrapers/fs-com"; import { scrapeFs } from "./scrapers/fs-com";
@ -54,6 +57,9 @@ import { crawlSwitchAssets } from "./scrapers/switch-assets-crawler";
import { crawlSwitchAssetsPlaywright } from "./scrapers/switch-assets-playwright"; import { crawlSwitchAssetsPlaywright } from "./scrapers/switch-assets-playwright";
import { scrapeAtgbics } from "./scrapers/atgbics"; import { scrapeAtgbics } from "./scrapers/atgbics";
import { scrapeProLabs } from "./scrapers/prolabs"; import { scrapeProLabs } from "./scrapers/prolabs";
import { scrapeNaddod } from "./scrapers/naddod";
import { scrapeQsfptek } from "./scrapers/qsfptek";
import { scrapeAddonNetworks } from "./scrapers/addon-networks";
import { pool } from "./utils/db"; import { pool } from "./utils/db";
const args = process.argv.slice(2); const args = process.argv.slice(2);
@ -86,6 +92,15 @@ async function runOnce(): Promise<void> {
if (args.includes("--prolabs") || isAll || isFetchOnly) { if (args.includes("--prolabs") || isAll || isFetchOnly) {
await scrapeProLabs(); await scrapeProLabs();
} }
if (args.includes("--naddod") || isAll || isFetchOnly) {
await scrapeNaddod();
}
if (args.includes("--qsfptek") || isAll || isFetchOnly) {
await scrapeQsfptek();
}
if (args.includes("--addon") || isAll || isFetchOnly) {
await scrapeAddonNetworks();
}
if (args.includes("--juniper") || isAll || isFetchOnly) { if (args.includes("--juniper") || isAll || isFetchOnly) {
await scrapeJuniperHct(); await scrapeJuniperHct();
} }
@ -172,7 +187,7 @@ async function runScheduler(): Promise<void> {
process.on("SIGTERM", shutdown); process.on("SIGTERM", shutdown);
} }
const ALL_FLAGS = ["--all", "--fs", "--cisco", "--optcore", "--news", "--flexoptix", "--vendors", "--10gtek", "--champion", "--fluxlight", "--sfpcables", "--gbics", "--prolabs", "--juniper", "--switches", "--whitebox", "--switches-ext", "--flexoptix-vendors", "--sonic-hcl", "--edgecore", "--ufispace", "--switch-assets", "--switch-crawl", "--switch-crawl-pw", "--fetch-only", "--atgbics"]; const ALL_FLAGS = ["--all", "--fs", "--cisco", "--optcore", "--news", "--flexoptix", "--vendors", "--10gtek", "--champion", "--fluxlight", "--sfpcables", "--gbics", "--prolabs", "--naddod", "--qsfptek", "--addon", "--juniper", "--switches", "--whitebox", "--switches-ext", "--flexoptix-vendors", "--sonic-hcl", "--edgecore", "--ufispace", "--switch-assets", "--switch-crawl", "--switch-crawl-pw", "--fetch-only", "--atgbics"];
if (args.some((a) => ALL_FLAGS.includes(a))) { if (args.some((a) => ALL_FLAGS.includes(a))) {
runOnce().catch((err) => { runOnce().catch((err) => {

View File

@ -36,6 +36,7 @@ async function withIsolatedStorage(name: string, fn: () => Promise<void>): Promi
await fn(); await fn();
} finally { } finally {
process.env.CRAWLEE_STORAGE_DIR = prev ?? ""; process.env.CRAWLEE_STORAGE_DIR = prev ?? "";
// Clean up after successful run
try { rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ } try { rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ }
} }
} }

View File

@ -0,0 +1,303 @@
/**
* AddOn Networks Scraper US-based compatible optics vendor
*
* addnetworks.com Enterprise-grade compatible transceivers.
* Products browseable under /products/ category pages.
* Pricing is public in USD. Rate limited: 1 req/2sec.
*
* AddOn Networks (AddOn Computer Products) specializes in OEM-compatible
* optics for Cisco, Juniper, Arista, HPE, and Dell environments.
* ~2500 SKUs, strong US channel presence.
*/
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
import { contentHash } from "../utils/hash";
// Site root; every category path in CATEGORIES is appended to this.
// NOTE(review): the vendor's public storefront may actually live at addonnetworks.com —
// confirm this host (also hard-coded in the parseProductList URL patterns) is the intended one.
const BASE = "https://www.addnetworks.com";
// Headers sent with every page request made by fetchPage.
const HEADERS = {
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
};
// Upper bound on the number of listing pages fetched per category.
const MAX_PAGES = 50;
// AddOn uses "compatible" suffix naming (e.g. "ADD-XSSFP10GE-LR-AO")
// Categories follow standard form-factor taxonomy.
// Each entry's formFactor/speed/speedGbps is stamped onto every product parsed from that path.
const CATEGORIES = [
{ path: "/products/networking/optical-networking/sfp/", formFactor: "SFP", speed: "1G", speedGbps: 1 },
{ path: "/products/networking/optical-networking/sfp-plus/", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
{ path: "/products/networking/optical-networking/sfp28/", formFactor: "SFP28", speed: "25G", speedGbps: 25 },
{ path: "/products/networking/optical-networking/qsfp-plus/", formFactor: "QSFP+", speed: "40G", speedGbps: 40 },
{ path: "/products/networking/optical-networking/qsfp28/", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
{ path: "/products/networking/optical-networking/qsfp-dd/", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
// Broader category fallback — scrapeAddonNetworks skips it once more than 3 specific categories yielded products.
// NOTE(review): anything scraped through this entry is labelled SFP+/10G regardless of its real type.
{ path: "/products/networking/optical-networking/", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
];
// One product row extracted from a category listing page by parseProductList.
interface Product {
// Part number pulled from the product name by regex (falls back to the first word of the name).
partNumber: string;
// Display name as found in the listing markup.
name: string;
// Absolute product URL; also used as the de-duplication key.
url: string;
// Listed price; undefined when no "$" amount was found or it fell outside (0, 100000).
// Stored as USD by scrapeAddonNetworks.
price?: number;
// formFactor / speed / speedGbps are copied from the CATEGORIES entry being scraped.
formFactor: string;
speed: string;
speedGbps: number;
// Reach as returned by detectReach (label such as "10km" plus the same distance in metres).
reachLabel?: string;
reachMeters?: number;
// "SMF", "MMF", "Copper" or "" as returned by detectFiber.
fiberType?: string;
// Digits preceding "nm" in the name as returned by detectWavelength, or "".
wavelength?: string;
// OEM brand returned by extractCompatibleVendor, or "" when none was recognised.
compatibleWith?: string;
}
/** Resolve after `ms` milliseconds; used to pace requests between page fetches. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Infer the nominal reach of an optic from free text (normally the product name).
 *
 * Rules are tried top to bottom and the first hit wins: explicit distances
 * ("80 km", "300m", case-insensitive) come before the upper-case reach codes
 * (LR, ER, ZR, SR, DR, FR and their "4" variants), which map to a typical distance.
 *
 * @param text Text to scan.
 * @returns The reach label and its length in metres, or undefined when nothing matches.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const rules: ReadonlyArray<readonly [RegExp, string, number]> = [
    // Explicit distances, longest first within each unit.
    [/\b120\s*km\b/i, "120km", 120000],
    [/\b80\s*km\b/i, "80km", 80000],
    [/\b40\s*km\b/i, "40km", 40000],
    [/\b20\s*km\b/i, "20km", 20000],
    [/\b10\s*km\b/i, "10km", 10000],
    [/\b2\s*km\b/i, "2km", 2000],
    [/\b550\s*m\b/i, "550m", 550],
    [/\b500\s*m\b/i, "500m", 500],
    [/\b400\s*m\b/i, "400m", 400],
    [/\b300\s*m\b/i, "300m", 300],
    [/\b150\s*m\b/i, "150m", 150],
    [/\b100\s*m\b/i, "100m", 100],
    // Reach codes (case-sensitive) used when no distance is spelled out.
    [/\bLR4\b/, "10km", 10000],
    [/\bLR\b/, "10km", 10000],
    [/\bER4?\b/, "40km", 40000],
    [/\bZR4?\b/, "80km", 80000],
    [/\bSR4?\b/, "300m", 300],
    [/\bDR4?\b/, "500m", 500],
    [/\bFR4?\b/, "2km", 2000],
  ];

  const hit = rules.find(([pattern]) => pattern.test(text));
  if (hit === undefined) return undefined;
  return { label: hit[1], meters: hit[2] };
}
/**
 * Classify the transmission medium of an optic from free text (normally the product name).
 *
 * Checks are ordered: single-mode indicators first, then multi-mode, then copper.
 * Two-letter reach codes (LX/LR/ER/ZR for SMF, SX/SR for MMF) only count when they
 * are not embedded in a longer word. The guard also accepts the start and end of the
 * string, so names that end in the code (e.g. "SFP-10G-LR", "10GBASE-SR") are
 * classified; a bare `[^a-z]` on both sides would demand a neighbouring character and
 * miss them.
 *
 * @param text Text to scan (matching is case-insensitive).
 * @returns "SMF", "MMF", "Copper", or "" when nothing matches.
 */
function detectFiber(text: string): string {
  if (/single.?mode|smf|(?:^|[^a-z])(?:lx|lr|er|zr)(?:[^a-z]|$)|bidi|cwdm|dwdm/i.test(text)) return "SMF";
  if (/multi.?mode|mmf|(?:^|[^a-z])(?:sx|sr)(?:[^a-z]|$)/i.test(text)) return "MMF";
  if (/copper|dac|twinax|rj.?45|base-t/i.test(text)) return "Copper";
  return "";
}
/**
 * Pull the wavelength out of free text: the 3–4 digits that directly precede "nm"
 * (optional whitespace allowed, case-insensitive).
 *
 * @param text Text to scan.
 * @returns The digits as a string (e.g. "1310"), or "" when no "…nm" value is present.
 */
function detectWavelength(text: string): string {
  const found = /(\d{3,4})\s*nm/i.exec(text);
  if (found === null) return "";
  return found[1];
}
/**
 * Guess which OEM a compatible optic targets, based on its name.
 *
 * 1. If the name contains a known brand as a whole word (case-insensitive), that
 *    brand is returned with the spelling used in the list; list order decides ties.
 * 2. Otherwise, if the name ends in the "-AO" / "-IN" suffix, the vendor is inferred
 *    from the leading OEM part-number prefix.
 *
 * @param name Product name or part number.
 * @returns The brand name, or "" when nothing is recognised.
 */
function extractCompatibleVendor(name: string): string {
  const knownBrands = ["Cisco", "Juniper", "Arista", "HPE", "HP", "Aruba", "Dell", "Brocade", "Extreme",
    "Huawei", "Nokia", "MikroTik", "Mellanox", "Nvidia", "Ubiquiti", "Force10",
    "Foundry", "Enterasys", "Allied Telesis", "Netgear", "Calix"];

  const namedBrand = knownBrands.find((candidate) => new RegExp(`\\b${candidate}\\b`, "i").test(name));
  if (namedBrand !== undefined) return namedBrand;

  // No brand spelled out: only names carrying the -AO / -IN suffix get prefix-based inference.
  const hasAddOnSuffix = /-AO$|-IN$/i.test(name);
  if (!hasAddOnSuffix) return "";

  // e.g. SFP-10G-SR-AO → Cisco
  if (/^SFP-|^GLC-|^QSFP-|^SFP28-/i.test(name)) return "Cisco";
  if (/^EX-|^QFX-/i.test(name) || /^740-|^J\d{4}/i.test(name)) return "Juniper";
  return "";
}
/**
 * Parse one AddOn Networks category listing page into Product records.
 *
 * Two extraction strategies are applied to the whitespace-collapsed HTML:
 *   1. `<li class="…product…">` cards: product URL, heading/name element and a "$" price inside the card.
 *   2. Only if strategy 1 produced nothing: any same-site link whose first child element
 *      holds text that looks like an optics product, with the price searched near the link.
 *
 * Form factor and speed are not parsed from the page; they are copied from `cat`.
 * URLs are de-duplicated within a single call via `seen`.
 *
 * @param html Raw HTML of the listing page.
 * @param cat  The CATEGORIES entry the page belongs to.
 * @returns Products found on the page (possibly empty).
 */
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
const products: Product[] = [];
const seen = new Set<string>();
// Collapse all whitespace runs so the patterns below do not have to account for line breaks.
const collapsed = html.replace(/\s+/g, " ");
// Strategy 1: product grid rendered as <li class="...product..."> cards
// NOTE(review): the lazy match stops at the first </li>, so a card containing nested <li> would be cut short.
for (const m of collapsed.matchAll(/<li[^>]+class="[^"]*product[^"]*"[^>]*>([\s\S]*?)<\/li>/gi)) {
const card = m[1];
const urlMatch = card.match(/href="(https?:\/\/(?:www\.)?addnetworks\.com\/[^"?#]+)"/i);
if (!urlMatch) continue;
const url = urlMatch[1];
// Skip URLs already handled and links that are not product/item detail pages.
if (seen.has(url) || !/\/product(?:s)?\/|\/item\//i.test(url)) continue;
// The URL is marked as seen before the name check, so a card without a usable name is not retried.
seen.add(url);
// Name: a heading (h2–h4), else an element whose attributes mention product name/title, else class="name…".
const nameMatch = card.match(/<h[2-4][^>]*>([^<]{10,})<\/h[2-4]>/i) ||
card.match(/product[_-]?(?:name|title)[^>]*>([^<]{10,})</i) ||
card.match(/class="name[^"]*"[^>]*>([^<]{10,})</i);
if (!nameMatch) continue;
// Only &amp; is decoded; numeric character references are dropped.
const name = nameMatch[1].trim().replace(/&amp;/g, "&").replace(/&#[0-9]+;/g, "");
if (name.length < 5) continue;
// First "$" amount in the card; thousands separators are removed before parsing.
const priceMatch = card.match(/\$\s*([\d,]+\.?\d*)/);
const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined;
const reach = detectReach(name);
// AddOn part numbers end in "-AO" or "-IN" suffix
// Fallbacks: first whitespace-separated word (max 80 chars), then the first 60 chars of the name.
const partNumber = name.match(/([A-Z0-9](?:[A-Z0-9\-\.\/]{4,}(?:-AO|-IN|-ADD)?))/)?.[1] ||
name.split(/\s+/)[0]?.slice(0, 80) || name.slice(0, 60);
products.push({
partNumber, name, url,
// Discard zero, negative and implausibly large prices.
price: price && price > 0 && price < 100000 ? price : undefined,
formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps,
reachLabel: reach?.label, reachMeters: reach?.meters,
fiberType: detectFiber(name), wavelength: detectWavelength(name),
compatibleWith: extractCompatibleVendor(name),
});
}
// Strategy 2: generic product-link fallback, used only when strategy 1 found no cards
if (products.length === 0) {
for (const m of collapsed.matchAll(/href="(https?:\/\/(?:www\.)?addnetworks\.com\/[^"?#]+)"[^>]*>\s*<[^>]+>\s*([^<]{10,})/gi)) {
const url = m[1];
const name = m[2].trim().replace(/&amp;/g, "&");
if (seen.has(url) || name.length < 10) continue;
// Keep only links whose text looks like an optics/cabling product.
if (!/transceiver|sfp|qsfp|osfp|dac|aoc|fiber|optical/i.test(name)) continue;
seen.add(url);
// Look for a price in a window around the first occurrence of the URL (300 chars before, 600 after).
const idx = collapsed.indexOf(url);
const ctx = collapsed.slice(Math.max(0, idx - 300), idx + 600);
const priceM = ctx.match(/\$\s*([\d,]+\.?\d*)/);
const price = priceM ? parseFloat(priceM[1].replace(/,/g, "")) : undefined;
const reach = detectReach(name);
products.push({
partNumber: name.match(/([A-Z0-9][A-Z0-9\-\.\/]{4,})/)?.[1] || name.split(/\s+/)[0]?.slice(0, 80) || "",
name, url,
price: price && price > 0 && price < 100000 ? price : undefined,
formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps,
reachLabel: reach?.label, reachMeters: reach?.meters,
fiberType: detectFiber(name), wavelength: detectWavelength(name),
compatibleWith: extractCompatibleVendor(name),
});
}
}
return products;
}
/**
 * Download a page as text using the scraper's HEADERS, aborting after 30 seconds.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body.
 * @throws Error with "HTTP <status> for <url>" when the response status is not OK.
 */
async function fetchPage(url: string): Promise<string> {
  const response = await fetch(url, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
/**
 * Work out how many listing pages a category has from page 1's pagination markup.
 * When no pagination hint is found, returns 2 so that page 2 is probed once.
 * The result is capped at MAX_PAGES.
 */
function detectTotalPages(html: string): number {
  const hint =
    html.match(/page\s+\d+\s+of\s+(\d+)/i) ||
    html.match(/aria-label="Last[^"]*"\s+href="[^"]*[?&]p=(\d+)/) ||
    html.match(/pagination[^>]*>[\s\S]*?(\d+)<\/a>\s*<\/[^>]+>\s*<\/[^>]+>/);
  const parsed = Number.parseInt(hint?.[1] ?? "", 10);
  return Number.isNaN(parsed) ? 2 : Math.min(parsed, MAX_PAGES);
}

/**
 * Scrape every AddOn Networks category in CATEGORIES and persist the results.
 *
 * For each category: fetch page 1, follow pagination (`?p=N`) until a page is empty,
 * fails, or the page limit is reached, de-duplicate products by URL, then upsert each
 * transceiver and — when a price was found — a USD price observation.
 *
 * Failures are contained: a failing page ends pagination for that category, a failing
 * category or DB write is logged and the run continues.
 *
 * Pacing: 2 s between pages, and 2 s after every category that issued a request —
 * including categories that turned out empty or failed — so the next category's first
 * request never follows immediately.
 */
export async function scrapeAddonNetworks(): Promise<void> {
  console.log("=== AddOn Networks Scraper Starting ===\n");

  const vendorId = await ensureVendor(
    "AddOn Networks",
    "compatible",
    "https://www.addnetworks.com",
    "https://www.addnetworks.com/products/networking/optical-networking/",
  );

  const genericFallbackPath = "/products/networking/optical-networking/";
  let totalProducts = 0;
  let priceUpdates = 0;
  const seenCategories = new Set<string>();

  for (const cat of CATEGORIES) {
    console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`);

    // Decide on the generic fallback before spending a request on it.
    if (cat.path === genericFallbackPath && seenCategories.size > 3) {
      console.log(` Skipping generic fallback (${seenCategories.size} specific categories scraped)`);
      continue;
    }

    try {
      const html1 = await fetchPage(BASE + cat.path);
      const catProducts = parseProductList(html1, cat);
      if (catProducts.length === 0) {
        console.log(" No products on page 1 — skipping");
        continue; // the finally block below still applies the inter-category delay
      }
      seenCategories.add(cat.path);
      console.log(` Found ${catProducts.length} products on page 1`);

      // Detect pagination
      const totalPages = detectTotalPages(html1);
      console.log(` Total pages (estimate): ${totalPages}`);

      const allProducts = [...catProducts];
      for (let page = 2; page <= totalPages; page++) {
        await sleep(2000);
        try {
          const pageUrl = BASE + cat.path + `?p=${page}`;
          const html = await fetchPage(pageUrl);
          const pageProds = parseProductList(html, cat);
          if (pageProds.length === 0) break; // ran past the last page
          allProducts.push(...pageProds);
          console.log(` Page ${page}: ${pageProds.length} products`);
        } catch (err) {
          const message = err instanceof Error ? err.message : String(err);
          console.warn(` Page ${page} failed: ${message.slice(0, 60)}`);
          break;
        }
      }

      // Keep the first occurrence of each URL (single pass instead of findIndex per item).
      const byUrl = new Map<string, Product>();
      for (const candidate of allProducts) {
        if (!byUrl.has(candidate.url)) byUrl.set(candidate.url, candidate);
      }
      const uniqueProducts = [...byUrl.values()];
      console.log(` Total unique: ${uniqueProducts.length}`);

      for (const product of uniqueProducts) {
        try {
          const txId = await findOrCreateScrapedTransceiver({
            partNumber: product.partNumber,
            vendorId,
            formFactor: product.formFactor,
            speedGbps: product.speedGbps,
            speed: product.speed,
            reachMeters: product.reachMeters,
            reachLabel: product.reachLabel,
            fiberType: product.fiberType,
            wavelengths: product.wavelength,
            category: "DataCenter",
          });
          if (product.price && product.price > 0) {
            const hash = contentHash({ price: product.price, part: product.partNumber });
            const updated = await upsertPriceObservation({
              transceiverId: txId,
              sourceVendorId: vendorId,
              price: product.price,
              currency: "USD",
              // NOTE(review): stock is not parsed from the page; every priced item is recorded as in stock.
              stockLevel: "in_stock",
              url: product.url,
              contentHash: hash,
            });
            if (updated) priceUpdates++;
          }
          totalProducts++;
        } catch (err) {
          const message = err instanceof Error ? err.message : String(err);
          console.warn(` DB error: ${message.slice(0, 80)}`);
        }
      }
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      console.error(` Category failed: ${message}`);
    } finally {
      // Runs on success, on failure and on `continue`, so the rate limit always holds.
      await sleep(2000);
    }
  }

  console.log(`\n=== AddOn Networks Complete: ${totalProducts} products, ${priceUpdates} price updates ===`);
}
// Direct execution (e.g. `tsx addon-networks.ts`): run the scraper once, then close the DB pool.
// On failure the error is logged, the pool is asked to close and the process exits with code 1.
if (require.main === module) {
  const run = scrapeAddonNetworks();
  run
    .then(() => pool.end())
    .catch((fatal) => {
      console.error("Fatal:", fatal);
      pool.end();
      process.exit(1);
    });
}

View File

@ -212,7 +212,7 @@ export async function scrapeChampionOne(): Promise<void> {
}); });
if (product.price && product.price > 0) { if (product.price && product.price > 0) {
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber })); const hash = contentHash({ price: product.price, part: product.partNumber });
const updated = await upsertPriceObservation({ const updated = await upsertPriceObservation({
transceiverId: txId, sourceVendorId: vendorId, transceiverId: txId, sourceVendorId: vendorId,
price: product.price, currency: product.currency || "USD", price: product.price, currency: product.currency || "USD",

View File

@ -99,19 +99,26 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
// Collapse whitespace for easier regex matching // Collapse whitespace for easier regex matching
const collapsed = html.replace(/\s+/g, " "); const collapsed = html.replace(/\s+/g, " ");
// BigCommerce article card pattern (updated): // BigCommerce card-title pattern:
// <article data-name="Product Name" data-product-price="2395" ...> // <a aria-label="Product Name, £XX.XX" href="URL" data-event-type="product-click">
// <a href="https://www.gbics.com/product-slug/" ...> const productRegex = /aria-label="([^"]+)"\s+href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"[^>]*data-event-type="product-click"/gi;
// Price is in pence (integer), divide by 100 = GBP
const articleRegex = /data-name="([^"]{10,200})"[^>]*data-product-price="\s*(\d+)\s*"[^>]*>[\s\S]{0,500}?href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"/gi;
let match; let match;
while ((match = articleRegex.exec(collapsed)) !== null) { while ((match = productRegex.exec(collapsed)) !== null) {
const name = match[1].trim(); const label = match[1].trim();
const priceRaw = parseInt(match[2], 10); const url = match[2];
const url = match[3];
// GBICS stores price in pence (integer) — e.g. 2395 = £23.95 OR £2,395.00 (full pounds)? // aria-label contains "Product Name, £XX.XX"
// Check by data-price-asc context: "data-price-asc=\"2395\"" with "£2,395.00" → price is in full GBP (no pence) // Split on last comma to separate name and price
const price = priceRaw > 0 ? priceRaw : undefined; const priceInLabel = label.match(/,\s*£\s*([\d,.]+)\s*$/);
const name = priceInLabel ? label.slice(0, label.lastIndexOf(",")).trim() : label;
let price = priceInLabel ? parseFloat(priceInLabel[1].replace(",", "")) : undefined;
// Fallback: extract price from data-price-asc attribute on parent <li>
if (!price) {
const priceContext = collapsed.slice(Math.max(0, match.index - 500), match.index);
const dataPriceMatch = priceContext.match(/data-price-asc="(\d+)"/);
if (dataPriceMatch) price = parseFloat(dataPriceMatch[1]);
}
if (name.length < 10) continue; if (name.length < 10) continue;

View File

@ -0,0 +1,285 @@
/**
* NADDOD Scraper — Chinese compatible transceiver vendor
*
* naddod.com — WooCommerce store, server-rendered HTML, USD pricing.
* Products are listed under product category pages.
* Pagination via /page/N/. Rate limited: 1 request per 2 seconds.
*
* NADDOD (Shenzhen NADDOD Information Co.) makes and sells compatible
* optics for Cisco, Juniper, Arista, etc. Transparent USD pricing.
*/
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
import { contentHash } from "../utils/hash";
// Site root; the category paths below are appended to it.
const BASE = "https://www.naddod.com";
// Headers sent with every page fetch (see fetchPage).
const HEADERS = {
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
};
// Upper bound on the number of listing pages fetched per category.
const MAX_PAGES = 30;
// Category listing pages to crawl. Every product parsed from a listing inherits
// that entry's formFactor / speed / speedGbps.
const CATEGORIES = [
{ path: "/product-category/1g-sfp-transceivers/", formFactor: "SFP", speed: "1G", speedGbps: 1 },
{ path: "/product-category/10g-sfp-transceivers/", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
{ path: "/product-category/25g-sfp28-transceivers/", formFactor: "SFP28", speed: "25G", speedGbps: 25 },
{ path: "/product-category/40g-qsfp-transceivers/", formFactor: "QSFP+", speed: "40G", speedGbps: 40 },
{ path: "/product-category/100g-qsfp28-transceivers/", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
{ path: "/product-category/200g-qsfp56-transceivers/", formFactor: "QSFP56", speed: "200G", speedGbps: 200 },
{ path: "/product-category/400g-qsfp-dd-transceivers/", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
{ path: "/product-category/800g-osfp-transceivers/", formFactor: "OSFP", speed: "800G", speedGbps: 800 },
// Generic catch-all listing: scrapeNaddod skips it once more than three other
// categories have yielded products. Anything taken from it is labelled SFP+ / 10G.
{ path: "/product-category/transceivers/", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
];
/** One product parsed from a NADDOD category listing page. */
interface Product {
/** Derived from the leading part of the product name (see parseProductList). */
partNumber: string;
/** Product title text as found on the listing. */
name: string;
/** Absolute product page URL; also used as the de-duplication key. */
url: string;
/** Parsed "$" amount; undefined when none was found or it is not within (0, 100000). */
price?: number;
/** Copied from the category entry the product was found under. */
formFactor: string;
/** Copied from the category entry, e.g. "10G". */
speed: string;
/** Copied from the category entry. */
speedGbps: number;
/** Reach label from detectReach, e.g. "10km"; undefined when nothing matched. */
reachLabel?: string;
/** Reach in meters from detectReach; undefined when nothing matched. */
reachMeters?: number;
/** "SMF", "MMF", "Copper" or "" (from detectFiber). */
fiberType?: string;
/** Digits preceding "nm" in the name, or "" (from detectWavelength). */
wavelength?: string;
/** Brand name from extractCompatibleVendor, or "" when none was recognised. */
compatibleWith?: string;
}
/** Returns a promise that resolves (with no value) once `ms` milliseconds have elapsed. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Guesses the optical reach advertised in a product name.
 *
 * Rules are tried top to bottom and the first hit wins: explicit distances
 * ("80km", "300 m") come first, then standard suffix codes (LR4, LR, ER, ZR,
 * SR, DR, FR), which are matched case-sensitively.
 *
 * @param text product name or description
 * @returns the reach label and its length in meters, or undefined when no rule matches
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const rules: { re: RegExp; label: string; meters: number }[] = [
    { re: /\b120\s*km\b/i, label: "120km", meters: 120000 },
    { re: /\b80\s*km\b/i, label: "80km", meters: 80000 },
    { re: /\b40\s*km\b/i, label: "40km", meters: 40000 },
    { re: /\b20\s*km\b/i, label: "20km", meters: 20000 },
    { re: /\b10\s*km\b/i, label: "10km", meters: 10000 },
    { re: /\b2\s*km\b/i, label: "2km", meters: 2000 },
    { re: /\b550\s*m\b/i, label: "550m", meters: 550 },
    { re: /\b500\s*m\b/i, label: "500m", meters: 500 },
    { re: /\b400\s*m\b/i, label: "400m", meters: 400 },
    { re: /\b300\s*m\b/i, label: "300m", meters: 300 },
    { re: /\b150\s*m\b/i, label: "150m", meters: 150 },
    { re: /\b100\s*m\b/i, label: "100m", meters: 100 },
    { re: /\bLR4\b/, label: "10km", meters: 10000 },
    { re: /\bLR\b/, label: "10km", meters: 10000 },
    { re: /\bER4?\b/, label: "40km", meters: 40000 },
    { re: /\bZR4?\b/, label: "80km", meters: 80000 },
    { re: /\bSR4?\b/, label: "300m", meters: 300 },
    { re: /\bDR4?\b/, label: "500m", meters: 500 },
    { re: /\bFR4?\b/, label: "2km", meters: 2000 },
  ];
  const winner = rules.find((rule) => rule.re.test(text));
  if (winner === undefined) {
    return undefined;
  }
  return { label: winner.label, meters: winner.meters };
}
/**
 * Classifies the transmission medium from a product name.
 *
 * Checks are ordered: single-mode hints first, then multi-mode, then copper.
 * The two-letter optic codes (LX/LR/ER/ZR, SX/SR) must not be adjacent to
 * another letter. Lookarounds are used instead of consuming a neighbouring
 * character so that a code at the very start or end of the name
 * (e.g. "SFP-10G-LR") is recognised as well.
 *
 * @param text product name or description
 * @returns "SMF", "MMF", "Copper", or "" when nothing matches
 */
function detectFiber(text: string): string {
  const singleMode = /single.?mode|smf|(?<![a-z])(?:lx|lr|er|zr)(?![a-z])|bidi|cwdm|dwdm/i;
  const multiMode = /multi.?mode|mmf|(?<![a-z])(?:sx|sr)(?![a-z])/i;
  const copper = /copper|dac|twinax|rj.?45|base-t/i;
  if (singleMode.test(text)) return "SMF";
  if (multiMode.test(text)) return "MMF";
  if (copper.test(text)) return "Copper";
  return "";
}
/**
 * Pulls the first 3-4 digit number that is followed by "nm" out of the text.
 *
 * @param text product name or description
 * @returns the digits as a string (e.g. "1310"), or "" when no wavelength is present
 */
function detectWavelength(text: string): string {
  const found = /(\d{3,4})\s*nm/i.exec(text);
  if (found === null) {
    return "";
  }
  return found[1];
}
/**
 * Works out which equipment brand a compatible optic is sold for.
 *
 * A known brand appearing as a whole word (any case) wins, in list order, and
 * is returned with the list's spelling. Otherwise the one or two capitalised
 * words following "for" / "compatible" / "compatible with" are used.
 *
 * @param name product name
 * @returns the brand, or "" when none can be determined
 */
function extractCompatibleVendor(name: string): string {
  const knownBrands = ["Cisco", "Juniper", "Arista", "HPE", "Dell", "Brocade", "Extreme", "Huawei",
    "Nokia", "MikroTik", "Mellanox", "Nvidia", "Ubiquiti"];
  const listed = knownBrands.find((candidate) => new RegExp(`\\b${candidate}\\b`, "i").test(name));
  if (listed !== undefined) {
    return listed;
  }
  const phrase = /(?:for\s+|compatible\s+(?:with\s+)?)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)/.exec(name);
  return phrase === null ? "" : phrase[1];
}
/**
 * Extracts products from one NADDOD category listing page.
 *
 * Strategy 1 walks the WooCommerce product loop (`<li class="...product...">`)
 * and reads URL, title and the first "$" amount from each card. Strategy 2 runs
 * only when strategy 1 produced nothing: it scans for any product/shop link
 * whose text looks like a transceiver and takes the first "$" amount in a
 * window around the link.
 *
 * @param html raw page HTML
 * @param cat category entry supplying formFactor / speed / speedGbps
 * @returns products in page order, at most one per URL
 */
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
  const flat = html.replace(/\s+/g, " ");
  const visitedUrls = new Set<string>();
  const results: Product[] = [];

  // First "$<amount>" in the snippet, thousands separators removed.
  const readPrice = (snippet: string): number | undefined => {
    const found = /\$\s*([\d,]+\.?\d*)/.exec(snippet);
    return found === null ? undefined : parseFloat(found[1].replace(/,/g, ""));
  };

  // Appends a product, filling in everything that is derived from the name or category.
  const record = (partNumber: string, name: string, url: string, price: number | undefined): void => {
    const reach = detectReach(name);
    results.push({
      partNumber,
      name,
      url,
      // Drop missing, NaN, zero and implausibly large prices.
      price: price !== undefined && price > 0 && price < 100000 ? price : undefined,
      formFactor: cat.formFactor,
      speed: cat.speed,
      speedGbps: cat.speedGbps,
      reachLabel: reach?.label,
      reachMeters: reach?.meters,
      fiberType: detectFiber(name),
      wavelength: detectWavelength(name),
      compatibleWith: extractCompatibleVendor(name),
    });
  };

  // Strategy 1: WooCommerce standard product loop
  for (const cardHit of flat.matchAll(/<li[^>]+class="[^"]*product[^"]*"[^>]*>([\s\S]*?)<\/li>/gi)) {
    const cardHtml = cardHit[1];
    const link = /href="(https?:\/\/(?:www\.)?naddod\.com\/product\/[^"]+)"/i.exec(cardHtml);
    if (link === null) continue;
    const url = link[1];
    if (visitedUrls.has(url)) continue;
    visitedUrls.add(url);
    const title =
      /woocommerce-loop-product__title[^>]*>([^<]+)</i.exec(cardHtml) ??
      /<h2[^>]*>([^<]{10,})<\/h2>/i.exec(cardHtml) ??
      /<h3[^>]*>([^<]{10,})<\/h3>/i.exec(cardHtml);
    if (title === null) continue;
    const name = title[1].trim().replace(/&amp;/g, "&").replace(/&#8211;/g, "");
    if (name.length < 5) continue;
    // Part number = the text before the first " compatible" / " for" / " sfp" / " qsfp".
    const leading = name.split(/\s+(?:compatible|for|sfp|qsfp)/i)[0]?.trim().slice(0, 80);
    record(leading || name.slice(0, 60), name, url, readPrice(cardHtml));
  }
  if (results.length > 0) {
    return results;
  }

  // Strategy 2: Generic product link fallback
  const linkRe = /href="(https?:\/\/(?:www\.)?naddod\.com\/(?:product|shop)\/[^"?#]+)"[^>]*>\s*([^<]{10,})/gi;
  for (let hit = linkRe.exec(flat); hit !== null; hit = linkRe.exec(flat)) {
    const url = hit[1];
    const name = hit[2].trim().replace(/&amp;/g, "&");
    if (visitedUrls.has(url) || name.length < 10) continue;
    if (!/transceiver|sfp|qsfp|osfp|dac|aoc|xfp/i.test(name)) continue;
    visitedUrls.add(url);
    // Look for a price shortly before or after the link.
    const around = flat.slice(Math.max(0, hit.index - 200), hit.index + 500);
    record(name.split(/\s+/)[0]?.slice(0, 80) || "", name, url, readPrice(around));
  }
  return results;
}
/**
 * Downloads a page as text using the scraper's HEADERS and a 30 s timeout.
 *
 * @param url absolute URL to fetch
 * @returns the response body
 * @throws Error when the response status is not OK
 */
async function fetchPage(url: string): Promise<string> {
  const response = await fetch(url, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
/**
 * Crawls every NADDOD category listing and stores the products and USD prices found.
 *
 * For each entry in CATEGORIES: fetch page 1, read the highest page number from
 * the pagination links (capped at MAX_PAGES), fetch the remaining pages,
 * de-duplicate by URL, then upsert each product and, when it has a price, a
 * price observation.
 *
 * Errors inside the category loop are logged and never abort the run: a failed
 * page ends that category's pagination, a failed category is skipped, and a
 * failed DB write skips that product.
 *
 * @returns resolves once all categories have been processed
 */
export async function scrapeNaddod(): Promise<void> {
  console.log("=== NADDOD Scraper Starting ===\n");
  const vendorId = await ensureVendor(
    "NADDOD",
    "compatible",
    "https://www.naddod.com",
    "https://www.naddod.com/product-category/transceivers/",
  );
  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));
  let totalProducts = 0;
  let priceUpdates = 0;
  const seenCategories = new Set<string>();
  for (const cat of CATEGORIES) {
    console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`);
    // Decide about the generic listing before spending a request on it.
    if (cat.path.includes("/transceivers/") && seenCategories.size > 3) {
      console.log(` Skipping generic fallback (${seenCategories.size} specific categories scraped)`);
      continue;
    }
    try {
      const html1 = await fetchPage(BASE + cat.path);
      const catProducts = parseProductList(html1, cat);
      if (catProducts.length === 0) {
        console.log(" No products on page 1 — skipping");
        continue; // the finally block below still paces the next request
      }
      seenCategories.add(cat.path);
      console.log(` Found ${catProducts.length} products on page 1`);
      // Take the largest numbered pagination link. Relying on "the last link on
      // the line" breaks when the pagination markup spans several lines.
      const pageNumbers = [...html1.matchAll(/page-numbers[^>]*>(\d+)<\/a>/g)].map((m) => parseInt(m[1], 10));
      const lastPage = pageNumbers.length > 0 ? Math.max(...pageNumbers) : 1;
      const totalPages = Math.min(lastPage, MAX_PAGES);
      console.log(` Total pages: ${totalPages}`);
      const allProducts = [...catProducts];
      for (let page = 2; page <= totalPages; page++) {
        await sleep(2000);
        try {
          const html = await fetchPage(BASE + cat.path + `page/${page}/`);
          const pageProds = parseProductList(html, cat);
          if (pageProds.length === 0) break;
          allProducts.push(...pageProds);
          console.log(` Page ${page}: ${pageProds.length} products`);
        } catch (err) {
          console.warn(` Page ${page} failed: ${describe(err).slice(0, 60)}`);
          break;
        }
      }
      // Keep the first product seen for each URL, in page order.
      const byUrl = new Map<string, Product>();
      for (const product of allProducts) {
        if (!byUrl.has(product.url)) byUrl.set(product.url, product);
      }
      const uniqueProducts = [...byUrl.values()];
      console.log(` Total unique: ${uniqueProducts.length}`);
      for (const product of uniqueProducts) {
        try {
          const txId = await findOrCreateScrapedTransceiver({
            partNumber: product.partNumber,
            vendorId,
            formFactor: product.formFactor,
            speedGbps: product.speedGbps,
            speed: product.speed,
            reachMeters: product.reachMeters,
            reachLabel: product.reachLabel,
            fiberType: product.fiberType,
            wavelengths: product.wavelength,
            category: "DataCenter",
          });
          if (product.price && product.price > 0) {
            const hash = contentHash({ price: product.price, part: product.partNumber });
            const updated = await upsertPriceObservation({
              transceiverId: txId,
              sourceVendorId: vendorId,
              price: product.price,
              currency: "USD",
              stockLevel: "in_stock",
              url: product.url,
              contentHash: hash,
            });
            if (updated) priceUpdates++;
          }
          totalProducts++;
        } catch (err) {
          console.warn(` DB error: ${describe(err).slice(0, 80)}`);
        }
      }
    } catch (err) {
      console.error(` Category failed: ${describe(err)}`);
    } finally {
      // Runs on every path that issued a request, including the early
      // `continue`, so the 1 request / 2 s limit always holds.
      await sleep(2000);
    }
  }
  console.log(`\n=== NADDOD Complete: ${totalProducts} products, ${priceUpdates} price updates ===`);
}
// Entry point when this file is executed directly rather than imported:
// run the scraper, then close the DB pool.
if (require.main === module) {
scrapeNaddod()
.then(() => pool.end())
// On failure: log, call pool.end() and exit non-zero. pool.end() is not awaited before process.exit(1).
.catch((err) => { console.error("Fatal:", err); pool.end(); process.exit(1); });
}

View File

@ -0,0 +1,281 @@
/**
* QSFPTEK Scraper — Chinese compatible transceiver vendor
*
* qsfptek.com — server-rendered HTML shop, USD pricing.
* Focuses on QSFP+/QSFP28/QSFP-DD/SFP+ form factors.
* Rate limited: 1 request per 2 seconds.
*
* QSFPTEK (Shenzhen Optotech Technology) — competitive pricing,
* transparent USD prices, no account required.
*/
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
import { contentHash } from "../utils/hash";
// Site root; the category paths below are appended to it.
const BASE = "https://www.qsfptek.com";
// Headers sent with every page fetch (see fetchPage).
const HEADERS = {
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
};
// Upper bound on the number of listing pages fetched per category.
const MAX_PAGES = 30;
// Category listing pages to crawl. Every product parsed from a listing inherits
// that entry's formFactor / speed / speedGbps.
const CATEGORIES = [
{ path: "/c/sfp-transceiver.html", formFactor: "SFP", speed: "1G", speedGbps: 1 },
{ path: "/c/sfp-plus-transceiver.html", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
{ path: "/c/sfp28-transceiver.html", formFactor: "SFP28", speed: "25G", speedGbps: 25 },
{ path: "/c/qsfp-plus-transceiver.html", formFactor: "QSFP+", speed: "40G", speedGbps: 40 },
{ path: "/c/qsfp28-transceiver.html", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
{ path: "/c/qsfp56-transceiver.html", formFactor: "QSFP56", speed: "200G", speedGbps: 200 },
{ path: "/c/qsfp-dd-transceiver.html", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
{ path: "/c/osfp-transceiver.html", formFactor: "OSFP", speed: "800G", speedGbps: 800 },
// Generic catch-all listing: scrapeQsfptek skips it once more than three other
// categories have yielded products. Anything taken from it is labelled SFP+ / 10G.
{ path: "/c/optical-transceiver.html", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
];
/** One product parsed from a QSFPTEK category listing page. */
interface Product {
/** Derived from the leading part of the product name (see parseProductList). */
partNumber: string;
/** Product title text as found on the listing. */
name: string;
/** Absolute product page URL; also used as the de-duplication key. */
url: string;
/** Parsed "$" amount; undefined when none was found or it is not within (0, 100000). */
price?: number;
/** Copied from the category entry the product was found under. */
formFactor: string;
/** Copied from the category entry, e.g. "10G". */
speed: string;
/** Copied from the category entry. */
speedGbps: number;
/** Reach label from detectReach, e.g. "10km"; undefined when nothing matched. */
reachLabel?: string;
/** Reach in meters from detectReach; undefined when nothing matched. */
reachMeters?: number;
/** "SMF", "MMF", "Copper" or "" (from detectFiber). */
fiberType?: string;
/** Digits preceding "nm" in the name, or "" (from detectWavelength). */
wavelength?: string;
/** Brand name from extractCompatibleVendor, or "" when none was recognised. */
compatibleWith?: string;
}
/** Returns a promise that settles (with no value) after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Guesses the optical reach advertised in a product name.
 *
 * Rules are tried top to bottom and the first hit wins: explicit distances
 * ("80km", "300 m") come first, then standard suffix codes (LR4, LR, ER, ZR,
 * SR, DR, FR), which are matched case-sensitively.
 *
 * @param text product name or description
 * @returns the reach label and its length in meters, or undefined when no rule matches
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const rules: { re: RegExp; label: string; meters: number }[] = [
    { re: /\b120\s*km\b/i, label: "120km", meters: 120000 },
    { re: /\b80\s*km\b/i, label: "80km", meters: 80000 },
    { re: /\b40\s*km\b/i, label: "40km", meters: 40000 },
    { re: /\b20\s*km\b/i, label: "20km", meters: 20000 },
    { re: /\b10\s*km\b/i, label: "10km", meters: 10000 },
    { re: /\b2\s*km\b/i, label: "2km", meters: 2000 },
    { re: /\b550\s*m\b/i, label: "550m", meters: 550 },
    { re: /\b500\s*m\b/i, label: "500m", meters: 500 },
    { re: /\b300\s*m\b/i, label: "300m", meters: 300 },
    { re: /\b100\s*m\b/i, label: "100m", meters: 100 },
    { re: /\bLR4\b/, label: "10km", meters: 10000 },
    { re: /\bLR\b/, label: "10km", meters: 10000 },
    { re: /\bER4?\b/, label: "40km", meters: 40000 },
    { re: /\bZR4?\b/, label: "80km", meters: 80000 },
    { re: /\bSR4?\b/, label: "300m", meters: 300 },
    { re: /\bDR4?\b/, label: "500m", meters: 500 },
    { re: /\bFR4?\b/, label: "2km", meters: 2000 },
  ];
  for (const rule of rules) {
    if (!rule.re.test(text)) continue;
    return { label: rule.label, meters: rule.meters };
  }
  return undefined;
}
/**
 * Classifies the transmission medium from a product name.
 *
 * Checks are ordered: single-mode hints first, then multi-mode, then copper.
 * The two-letter optic codes (LX/LR/ER/ZR, SX/SR) must not be adjacent to
 * another letter. Lookarounds are used instead of consuming a neighbouring
 * character so that a code at the very start or end of the name
 * (e.g. "SFP-10G-LR") is recognised as well.
 *
 * @param text product name or description
 * @returns "SMF", "MMF", "Copper", or "" when nothing matches
 */
function detectFiber(text: string): string {
  const singleMode = /single.?mode|smf|(?<![a-z])(?:lx|lr|er|zr)(?![a-z])|bidi|cwdm|dwdm/i;
  const multiMode = /multi.?mode|mmf|(?<![a-z])(?:sx|sr)(?![a-z])/i;
  const copper = /copper|dac|twinax|rj.?45|base-t/i;
  if (singleMode.test(text)) return "SMF";
  if (multiMode.test(text)) return "MMF";
  if (copper.test(text)) return "Copper";
  return "";
}
/**
 * Pulls the first 3-4 digit number that is followed by "nm" out of the text.
 *
 * @param text product name or description
 * @returns the digits as a string (e.g. "850"), or "" when no wavelength is present
 */
function detectWavelength(text: string): string {
  const found = /(\d{3,4})\s*nm/i.exec(text);
  return found !== null ? found[1] : "";
}
/**
 * Finds the first known equipment brand mentioned in a product name.
 *
 * Brands are matched as whole words, ignoring case, in list order; the list's
 * own spelling is returned.
 *
 * @param name product name
 * @returns the brand, or "" when none of the known brands appears
 */
function extractCompatibleVendor(name: string): string {
  const knownBrands = ["Cisco", "Juniper", "Arista", "HPE", "Aruba", "Dell", "Brocade", "Extreme",
    "Huawei", "Nokia", "MikroTik", "Mellanox", "Nvidia", "Ubiquiti", "Allied Telesis"];
  const listed = knownBrands.find((candidate) => new RegExp(`\\b${candidate}\\b`, "i").test(name));
  return listed ?? "";
}
/**
 * Extracts products from one QSFPTEK category listing page.
 *
 * Strategy 1 walks product cards (`<div class="...product-thumb|product-layout...">`)
 * and reads URL, title and the first "$" amount from each card. Strategy 2 runs
 * only when strategy 1 produced nothing: it scans for any /p… or /product… link
 * whose text looks like a transceiver and takes the first "$" amount in a
 * window around that link.
 *
 * @param html raw page HTML
 * @param cat category entry supplying formFactor / speed / speedGbps
 * @returns products in page order, at most one per URL
 */
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
  const products: Product[] = [];
  const seen = new Set<string>();
  // Collapse whitespace so the patterns below do not have to allow for line breaks.
  const collapsed = html.replace(/\s+/g, " ");

  // Strategy 1: OpenCart / custom card layout using matchAll
  for (const cardMatch of collapsed.matchAll(/<div[^>]+class="[^"]*product-(?:thumb|layout)[^"]*"[^>]*>([\s\S]*?)<\/div>\s*<\/div>/gi)) {
    const card = cardMatch[1];
    const urlMatch = card.match(/href="(https?:\/\/(?:www\.)?qsfptek\.com\/[^"]+)"/i);
    if (!urlMatch) continue;
    const url = urlMatch[1];
    if (seen.has(url)) continue;
    seen.add(url);
    const nameMatch = card.match(/<h[34][^>]*>\s*<a[^>]*>([^<]{10,})<\/a>/i) ||
      card.match(/<a[^>]*title="([^"]{10,})"/i);
    if (!nameMatch) continue;
    const name = nameMatch[1].trim().replace(/&amp;/g, "&").replace(/&#[0-9]+;/g, "");
    if (name.length < 5) continue;
    const priceMatch = card.match(/\$\s*([\d,]+\.?\d*)/);
    const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined;
    const reach = detectReach(name);
    // Part number = the text before the first " compatible" / " for" / " sfp" / " qsfp".
    const partNumber = name.split(/\s+(?:compatible|for|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);
    products.push({
      partNumber, name, url,
      // Drop missing, NaN, zero and implausibly large prices.
      price: price && price > 0 && price < 100000 ? price : undefined,
      formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps,
      reachLabel: reach?.label, reachMeters: reach?.meters,
      fiberType: detectFiber(name), wavelength: detectWavelength(name),
      compatibleWith: extractCompatibleVendor(name),
    });
  }

  // Strategy 2: Generic product link scan using matchAll
  if (products.length === 0) {
    for (const m of collapsed.matchAll(/href="(https?:\/\/(?:www\.)?qsfptek\.com\/(?:p|product)[^"?#]+)"[^>]*>([^<]{10,})</gi)) {
      const url = m[1];
      const name = m[2].trim().replace(/&amp;/g, "&");
      if (seen.has(url) || name.length < 10) continue;
      if (!/transceiver|sfp|qsfp|osfp|dac|aoc/i.test(name)) continue;
      seen.add(url);
      // Anchor the price window on this match. Searching for the URL text could
      // land on an earlier anchor or on a longer URL that shares the prefix.
      // The match starts at `href="`, so the URL begins six characters later.
      const idx = m.index !== undefined ? m.index + 'href="'.length : collapsed.indexOf(url);
      const ctx = collapsed.slice(Math.max(0, idx - 300), idx + 600);
      const priceM = ctx.match(/\$\s*([\d,]+\.?\d*)/);
      const price = priceM ? parseFloat(priceM[1].replace(/,/g, "")) : undefined;
      const reach = detectReach(name);
      products.push({
        partNumber: name.split(/\s+/)[0]?.slice(0, 80) || "",
        name, url,
        price: price && price > 0 && price < 100000 ? price : undefined,
        formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps,
        reachLabel: reach?.label, reachMeters: reach?.meters,
        fiberType: detectFiber(name), wavelength: detectWavelength(name),
        compatibleWith: extractCompatibleVendor(name),
      });
    }
  }
  return products;
}
/**
 * Downloads a page as text using the scraper's HEADERS and a 30 s timeout.
 *
 * @param url absolute URL to fetch
 * @returns the response body
 * @throws Error when the response status is not OK
 */
async function fetchPage(url: string): Promise<string> {
  const requestOptions = { headers: HEADERS, signal: AbortSignal.timeout(30000) };
  const response = await fetch(url, requestOptions);
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
/**
 * Crawls every QSFPTEK category listing and stores the products and USD prices found.
 *
 * For each entry in CATEGORIES: fetch page 1, read the page count from the
 * pagination markup (falling back to an estimate of 3, capped at MAX_PAGES),
 * fetch the remaining pages until one comes back empty, de-duplicate by URL,
 * then upsert each product and, when it has a price, a price observation.
 *
 * Errors inside the category loop are logged and never abort the run: a failed
 * page ends that category's pagination, a failed category is skipped, and a
 * failed DB write skips that product.
 *
 * @returns resolves once all categories have been processed
 */
export async function scrapeQsfptek(): Promise<void> {
  console.log("=== QSFPTEK Scraper Starting ===\n");
  const vendorId = await ensureVendor(
    "QSFPTEK",
    "compatible",
    "https://www.qsfptek.com",
    "https://www.qsfptek.com/c/optical-transceiver.html",
  );
  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));
  let totalProducts = 0;
  let priceUpdates = 0;
  const seenCategories = new Set<string>();
  for (const cat of CATEGORIES) {
    console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`);
    // Decide about the generic listing before spending a request on it.
    if (cat.path.includes("/optical-transceiver") && seenCategories.size > 3) {
      console.log(` Skipping generic fallback (${seenCategories.size} specific categories scraped)`);
      continue;
    }
    try {
      const html1 = await fetchPage(BASE + cat.path);
      const catProducts = parseProductList(html1, cat);
      if (catProducts.length === 0) {
        console.log(" No products on page 1 — skipping");
        continue; // the finally block below still paces the next request
      }
      seenCategories.add(cat.path);
      console.log(` Found ${catProducts.length} products on page 1`);
      const totalPagesMatch =
        html1.match(/total-page[^>]*>\s*(\d+)/) ||
        html1.match(/page\s+\d+\s+of\s+(\d+)/i);
      // With no page count in the markup, probe up to 3 pages; an empty page ends the loop early.
      const totalPages = totalPagesMatch ? Math.min(parseInt(totalPagesMatch[1], 10), MAX_PAGES) : 3;
      console.log(` Total pages (estimate): ${totalPages}`);
      const allProducts = [...catProducts];
      for (let page = 2; page <= totalPages; page++) {
        await sleep(2000);
        try {
          const pageUrl = BASE + cat.path.replace(".html", "") + `?page=${page}`;
          const html = await fetchPage(pageUrl);
          const pageProds = parseProductList(html, cat);
          if (pageProds.length === 0) break;
          allProducts.push(...pageProds);
          console.log(` Page ${page}: ${pageProds.length} products`);
        } catch (err) {
          console.warn(` Page ${page} failed: ${describe(err).slice(0, 60)}`);
          break;
        }
      }
      // Keep the first product seen for each URL, in page order.
      const byUrl = new Map<string, Product>();
      for (const product of allProducts) {
        if (!byUrl.has(product.url)) byUrl.set(product.url, product);
      }
      const uniqueProducts = [...byUrl.values()];
      console.log(` Total unique: ${uniqueProducts.length}`);
      for (const product of uniqueProducts) {
        try {
          const txId = await findOrCreateScrapedTransceiver({
            partNumber: product.partNumber,
            vendorId,
            formFactor: product.formFactor,
            speedGbps: product.speedGbps,
            speed: product.speed,
            reachMeters: product.reachMeters,
            reachLabel: product.reachLabel,
            fiberType: product.fiberType,
            wavelengths: product.wavelength,
            category: "DataCenter",
          });
          if (product.price && product.price > 0) {
            const hash = contentHash({ price: product.price, part: product.partNumber });
            const updated = await upsertPriceObservation({
              transceiverId: txId,
              sourceVendorId: vendorId,
              price: product.price,
              currency: "USD",
              stockLevel: "in_stock",
              url: product.url,
              contentHash: hash,
            });
            if (updated) priceUpdates++;
          }
          totalProducts++;
        } catch (err) {
          console.warn(` DB error: ${describe(err).slice(0, 80)}`);
        }
      }
    } catch (err) {
      console.error(` Category failed: ${describe(err)}`);
    } finally {
      // Runs on every path that issued a request, including the early
      // `continue`, so the 1 request / 2 s limit always holds.
      await sleep(2000);
    }
  }
  console.log(`\n=== QSFPTEK Complete: ${totalProducts} products, ${priceUpdates} price updates ===`);
}
// Entry point when this file is executed directly rather than imported:
// run the scraper, then close the DB pool.
if (require.main === module) {
scrapeQsfptek()
.then(() => pool.end())
// On failure: log, call pool.end() and exit non-zero. pool.end() is not awaited before process.exit(1).
.catch((err) => { console.error("Fatal:", err); pool.end(); process.exit(1); });
}

View File

@ -203,7 +203,7 @@ export async function scrapeSfpCables(): Promise<void> {
}); });
if (product.price && product.price > 0) { if (product.price && product.price > 0) {
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber })); const hash = contentHash({ price: product.price, part: product.partNumber });
const updated = await upsertPriceObservation({ const updated = await upsertPriceObservation({
transceiverId: txId, transceiverId: txId,
sourceVendorId: vendorId, sourceVendorId: vendorId,

View File

@ -196,7 +196,7 @@ export async function scrape10Gtek(): Promise<void> {
}); });
if (product.price && product.price > 0) { if (product.price && product.price > 0) {
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber })); const hash = contentHash({ price: product.price, part: product.partNumber });
const updated = await upsertPriceObservation({ const updated = await upsertPriceObservation({
transceiverId: txId, transceiverId: txId,
sourceVendorId: vendorId, sourceVendorId: vendorId,