fix: filter TIP hot topics quality
This commit is contained in:
parent
e5917a2250
commit
a0657ee565
@ -146,6 +146,7 @@ hotTopicsRouter.get("/", async (req, res) => {
|
||||
};
|
||||
|
||||
for (const m of marketIntel.rows) {
|
||||
if (!isUsableMarketIntel(m)) continue;
|
||||
const techList = Array.isArray(m.technologies) ? (m.technologies as string[]).join(", ") : "";
|
||||
const angle = m.buy_signal_implication
|
||||
? buySignalToAngle[m.buy_signal_implication] || m.buy_signal_implication
|
||||
@ -251,6 +252,8 @@ hotTopicsRouter.get("/", async (req, res) => {
|
||||
// ═══ SOURCE 7: Evergreen High-Value Topics ═══
|
||||
topics.push(...getEvergreenTopics(year));
|
||||
|
||||
const cleanTopics = dedupeTopics(topics.filter(isUsableTopic));
|
||||
|
||||
// Mark already-created topics and rank with daily rotation + source diversity.
|
||||
const recentDrafts = await pool.query(`
|
||||
SELECT title, created_at
|
||||
@ -269,7 +272,7 @@ hotTopicsRouter.get("/", async (req, res) => {
|
||||
|
||||
const daySeed = getDaySeed();
|
||||
const rotationSeed = daySeed + getQuerySeed(req.query.shuffle);
|
||||
for (const topic of topics) {
|
||||
for (const topic of cleanTopics) {
|
||||
const createdAt = createdByTitle.get(normalizeTopicTitle(topic.title));
|
||||
topic.blog_title_created = Boolean(createdAt);
|
||||
topic.last_blog_created_at = createdAt;
|
||||
@ -277,7 +280,7 @@ hotTopicsRouter.get("/", async (req, res) => {
|
||||
topic.llm_context = buildTopicBriefing(topic);
|
||||
}
|
||||
|
||||
const rankedTopics = selectDiverseTopics(topics, limit);
|
||||
const rankedTopics = selectDiverseTopics(cleanTopics, limit);
|
||||
|
||||
// Next daily rotation: tomorrow 00:00 UTC
|
||||
const tomorrow = new Date();
|
||||
@ -286,7 +289,7 @@ hotTopicsRouter.get("/", async (req, res) => {
|
||||
|
||||
res.json({
|
||||
topics: rankedTopics,
|
||||
total: topics.length,
|
||||
total: cleanTopics.length,
|
||||
generated_at: new Date().toISOString(),
|
||||
refreshes_at: tomorrow.toISOString(),
|
||||
day_seed: daySeed,
|
||||
@ -299,6 +302,68 @@ hotTopicsRouter.get("/", async (req, res) => {
|
||||
}
|
||||
});
|
||||
|
||||
/**
 * Lower-case patterns that mark a market-intelligence row or a hot topic as an
 * off-domain artifact (privacy/legal/compliance pages, op-amp electronics).
 *
 * Shared by `isUsableMarketIntel` and `isUsableTopic` so the two filters cannot
 * drift apart. None of the patterns carries the `g` or `y` flag, so calling
 * `test` on these shared instances keeps no state between calls.
 */
const REJECTED_TOPIC_PATTERNS: readonly RegExp[] = [
  /privacy shield/,
  /personal data/,
  /data transfer framework/,
  /gdpr/,
  /cookie/,
  /privacy policy/,
  /operational amplifier/,
  /\boamp\b/,
];

/**
 * Lower-case substrings that count as a direct optical/networking signal.
 * Matching is plain substring containment, so e.g. "zr" also matches "400zr".
 */
const OPTICAL_SIGNALS: readonly string[] = [
  "transceiver", "optic", "optical", "photonics", "fiber", "fibre", "400g", "800g",
  "1.6t", "qsfp", "osfp", "sfp", "coherent", "zr", "cpo", "lpo", "silicon photonics",
  "data center", "datacenter", "ai fabric", "ethernet", "dci",
];

/** Returns true when the already lower-cased text hits any rejected pattern. */
function matchesRejectedTopicPattern(loweredText: string): boolean {
  return REJECTED_TOPIC_PATTERNS.some((pattern) => pattern.test(loweredText));
}

/**
 * Decides whether a market-intelligence row may become a hot topic.
 *
 * @param row - Raw row; the fields read are `title`, `summary`, `source_name`
 *   and `technologies` (an array is joined with spaces, anything else is
 *   stringified).
 * @returns `false` when the title/summary/source text matches a rejected
 *   pattern; otherwise `true` only if at least one optical signal occurs in
 *   that text or in the technologies.
 */
function isUsableMarketIntel(row: Record<string, unknown>): boolean {
  const text = `${row.title || ""} ${row.summary || ""} ${row.source_name || ""}`.toLowerCase();
  const tech = Array.isArray(row.technologies)
    ? row.technologies.join(" ").toLowerCase()
    : String(row.technologies || "").toLowerCase();

  // The reject list is applied to the descriptive text only, not to `tech`.
  if (matchesRejectedTopicPattern(text)) return false;

  return OPTICAL_SIGNALS.some((signal) => text.includes(signal) || tech.includes(signal));
}

/**
 * Decides whether an assembled hot topic is free of off-domain artifacts.
 *
 * @param topic - Topic whose `title`, `description` and `source` are checked.
 * @returns `true` when none of the rejected patterns matches the combined,
 *   lower-cased text. Unlike `isUsableMarketIntel`, no optical signal is
 *   required.
 */
function isUsableTopic(topic: HotTopic): boolean {
  const text = `${topic.title} ${topic.description} ${topic.source}`.toLowerCase();
  return !matchesRejectedTopicPattern(text);
}
|
||||
|
||||
/**
 * Collapses topics that share a normalized title into a single entry.
 *
 * For each title key the topic with the strictly higher `scorePreRankTopic`
 * value is kept; on a tie the earlier topic stays. The result is ordered by the
 * first appearance of each key, because a `Map` keeps a key's original
 * insertion position when its value is overwritten.
 *
 * @param topics - Candidate topics, possibly containing duplicates.
 * @returns One topic per normalized title.
 */
function dedupeTopics(topics: HotTopic[]): HotTopic[] {
  const bestByTitle = new Map<string, HotTopic>();

  for (const candidate of topics) {
    const titleKey = normalizeTopicTitle(candidate.title);
    const incumbent = bestByTitle.get(titleKey);

    if (incumbent) {
      const candidateWins = scorePreRankTopic(candidate) > scorePreRankTopic(incumbent);
      if (!candidateWins) continue;
    }

    bestByTitle.set(titleKey, candidate);
  }

  return Array.from(bestByTitle.values());
}
|
||||
|
||||
/**
 * Computes a coarse pre-ranking score used to pick the better of two topics.
 *
 * Score = urgency tier * 100 + freshness (0..30) + 1 if `data_context` is set.
 * Freshness is `30 - age in days`, clamped to the range [0, 30].
 *
 * @param topic - Topic to score; reads `urgency`, `date` and `data_context`.
 * @returns A finite number; an unknown urgency contributes 0.
 */
function scorePreRankTopic(topic: HotTopic): number {
  const urgencyScore: Record<HotTopic["urgency"], number> = {
    breaking: 4,
    hot: 3,
    trending: 2,
    emerging: 1,
  };
  const msPerDay = 86400000;
  const maxFreshness = 30;

  let freshness = 0;
  if (topic.date) {
    const publishedAt = new Date(topic.date).getTime();
    // An unparseable date yields NaN, which would turn the whole score into NaN
    // and make every comparison against this topic false. Treat it as undated.
    if (!Number.isNaN(publishedAt)) {
      const ageDays = (Date.now() - publishedAt) / msPerDay;
      // The upper clamp keeps future-dated topics from gaining unbounded
      // freshness that could outweigh the 100-point urgency step.
      freshness = Math.min(maxFreshness, Math.max(0, maxFreshness - ageDays));
    }
  }

  const dataScore = topic.data_context ? 1 : 0;
  return (urgencyScore[topic.urgency] || 0) * 100 + freshness + dataScore;
}
|
||||
|
||||
function normalizeTopicTitle(title: string): string {
|
||||
return title
|
||||
.toLowerCase()
|
||||
|
||||
@ -1,9 +1,23 @@
|
||||
# Current TIP Sync State
|
||||
|
||||
Updated: 2026-05-10 13:25 UTC
|
||||
Updated: 2026-05-10 13:55 UTC
|
||||
|
||||
## Newest Work
|
||||
|
||||
- TIP Blog Engine Hot Topics cleanup on 2026-05-10 UTC:
|
||||
- fixed the repeated/irrelevant Hot Topics problem the user reported earlier
|
||||
- `packages/api/src/routes/hot-topics.ts` now filters bad market-intelligence artifacts:
|
||||
- Privacy Shield
|
||||
- personal data / data transfer / GDPR / cookie/privacy-policy topics
|
||||
- operational amplifier / OAmp electronics-only artifacts
|
||||
- market-intelligence topics now require direct optical/networking signals such as transceiver, optic, optical, photonics, fiber, 400G, 800G, 1.6T, QSFP, OSFP, SFP, coherent, ZR, CPO, LPO, silicon photonics, data center, AI fabric, Ethernet or DCI
|
||||
- topics are now deduplicated by normalized title (keeping the highest-urgency, freshest duplicate) before ranking and selection
|
||||
- deployed route to Erik, API build passed, `tip-api` restarted
|
||||
- verification:
|
||||
- `/api/hot-topics?limit=10` no longer returns Privacy Shield duplicates or OAmp topics
|
||||
- default and `shuffle=alt` outputs differ in ranking while staying stable per seed
|
||||
- TIPLLM training pool updated with the Hot Topics quality filter rule
|
||||
|
||||
- TIP active-vs-total product counter correction on 2026-05-10 UTC:
|
||||
- fixed another UI ambiguity: the header/Overview showed raw `transceivers` table count (`17781`) while research status correctly used active product scope (`16236`)
|
||||
- changed dashboard header label from `transceivers` to `active products`
|
||||
|
||||
49
sync/history/2026-05-10-tip-hot-topics-quality-filter.md
Normal file
49
sync/history/2026-05-10-tip-hot-topics-quality-filter.md
Normal file
@ -0,0 +1,49 @@
|
||||
# TIP Hot Topics Quality Filter
|
||||
|
||||
Date: 2026-05-10 UTC
|
||||
Owner: Codex
|
||||
|
||||
## Summary
|
||||
|
||||
The Blog Engine Hot Topics endpoint was surfacing repeated and irrelevant topics, including Privacy Shield duplicates and operational amplifier content.
|
||||
|
||||
## Change
|
||||
|
||||
Updated `packages/api/src/routes/hot-topics.ts`:
|
||||
|
||||
- filters privacy/legal/compliance artifacts:
|
||||
- Privacy Shield
|
||||
- personal data
|
||||
- data transfer framework
|
||||
- GDPR
|
||||
- cookies/privacy-policy topics
|
||||
- filters electronics-only artifacts:
|
||||
- operational amplifier
|
||||
- OAmp
|
||||
- requires direct optical/networking signals for market-intelligence rows:
|
||||
- transceiver
|
||||
- optic / optical
|
||||
- photonics
|
||||
- fiber/fibre
|
||||
- 400G / 800G / 1.6T
|
||||
- QSFP / OSFP / SFP
|
||||
- coherent / ZR
|
||||
- CPO / LPO
|
||||
- silicon photonics
|
||||
- data center / AI fabric / Ethernet / DCI
|
||||
- deduplicates topics by normalized title, keeping the duplicate with the highest urgency (then freshness and data context), before ranking and selection.
|
||||
|
||||
## Deployment
|
||||
|
||||
- Deployed to Erik.
|
||||
- API build passed.
|
||||
- Restarted `tip-api`.
|
||||
|
||||
## Verification
|
||||
|
||||
- `/api/hot-topics?limit=10` no longer returns Privacy Shield duplicates or OAmp topics.
|
||||
- `/api/hot-topics?limit=10&shuffle=alt` produces a different ranking while preserving source quality.
|
||||
|
||||
## Training Pool
|
||||
|
||||
Added `tip-20260510-hot-topics-quality-filter-001` to `training-data/tip-llm-capabilities-v1.jsonl`.
|
||||
@ -66,3 +66,4 @@
|
||||
{"id":"tip-20260510-review-badge-zero-rule-001","source":"codex-tip-manual-review-closure-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Dashboard counters should show current work, not historical tables."},{"role":"user","content":"How should the Manual Review badge be counted after research-resolution closure?"},{"role":"assistant","content":"Count `pending` manual review only for equivalence rows where `eq.status='pending'` and the Flexoptix product still has `competitor_status IN ('unknown','needs_research')`. Do not show stale pending candidates for products already resolved as `matched`, `no_valid_match`, or `ambiguous`. After the 2026-05-10 cleanup, `transceiver_equivalences` had `pending=0`; the dashboard Review badge should hide."}]}
|
||||
{"id":"tip-20260510-dashboard-research-status-001","source":"codex-tip-dashboard-closure-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. The dashboard must communicate operational truth: completed research is not the same as strict public-source verification."},{"role":"user","content":"The Overview page shows Price Verified 67%, Image Verified 71%, Details Verified 99%, Fully Verified 65%, even though research_resolved is 100%. How should TIP present this?"},{"role":"assistant","content":"On the Overview, show `Data Research Status` using terminal research buckets, not only strict booleans. Cards should report `Price Resolved`, `Image Resolved`, `Details Resolved`, `Competitor Resolved`, and `All Research Resolved`. Compute resolved counts from status buckets: public plus no-public plus ambiguous. Keep strict `fully_verified` visible only as a source-backed quality metric, not as an open-work progress bar. This prevents the dashboard from implying manual work is still open when all `needs_research` buckets are zero."}]}
|
||||
{"id":"tip-20260510-active-vs-total-products-001","source":"codex-tip-dashboard-closure-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Make dashboard counters distinguish active product scope from raw database row counts."},{"role":"user","content":"Why does TIP show 17,781 transceivers at the top but research status says 16,236 products?"},{"role":"assistant","content":"The larger number is the raw `transceivers` table count, including archived/quarantined rows such as NonTransceiver, AOC/DAC families, category/catalog artifacts, and historical rows. The active research scope is `verification.total`, currently 16,236. The dashboard should label the top KPI as `Active Products`, use the active count in the header/overview, and show raw total rows only as context such as `17,781 total rows · 1,545 archived/quarantined`."}]}
|
||||
{"id":"tip-20260510-hot-topics-quality-filter-001","source":"codex-tip-blog-engine-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Hot Topics must be useful source-backed blog prompts, not repeated irrelevant crawler artifacts."},{"role":"user","content":"The Blog Engine Hot Topics keep showing the same irrelevant topics like Privacy Shield or operational amplifier content. What should TIP do?"},{"role":"assistant","content":"Filter market-intelligence rows before ranking. Reject privacy/legal/compliance artifacts such as Privacy Shield, GDPR, cookies, personal data and data-transfer framework topics. Reject electronics-only artifacts such as operational amplifier/OAmp unless they have a direct optical-transceiver context. Require optical/networking signals such as transceiver, optic, optical, photonics, fiber, 400G, 800G, 1.6T, QSFP, OSFP, SFP, coherent, ZR, CPO, LPO, silicon photonics, data center, AI fabric, Ethernet or DCI. Dedupe by normalized title before ranking, then use seeded daily rotation and optional shuffle for varied output."}]}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user