transceiver-db/sql/102-product-verification-reconcile.sql
Rene Fichtmueller a20094755d feat(scraper): Flexoptix REST API sync robot + scheduler integration
Replaces the GraphQL/search-based Flexoptix scraper with a proper
Magento 2 REST API integration that delivers authoritative SKUs,
prices, stock levels and compatibility data.

New files:
- packages/scraper/src/robots/flexoptix-api-sync.ts
  Self-contained robot: auth → paginated fetch → normalize → DB write.
  Reads FLEXOPTIX_API_BASE_URL / _USERNAME / _PASSWORD from env.
  Returns { fetched, normalized, skipped, priceWrites, stockWrites }.
  No file intermediary — in-memory pipeline.

- scripts/import-flexoptix-catalog.ts
  One-shot CLI importer for the Pulso-generated JSONL (Codex handover).

- docs/FLEXOPTIX_CATALOG_IMPORT.md
  Runbook for manual import + per-SKU specifications enrichment.

Scheduler changes:
- Added sync:flexoptix-catalog queue + work() handler
- Scheduled every 2h at 0 */2 * * * (same cadence as legacy job)
- scrape:pricing:flexoptix kept as legacy GraphQL fallback

Also includes Codex-generated additions from this sprint:
- audiocodes-oem scraper, seed-batch35/36/37, db.ts improvements,
  sql/102 verification reconcile, README + package.json updates
2026-05-13 16:36:33 +02:00

82 lines
3.0 KiB
PL/PgSQL

-- Migration 102: Product photo/details verification reconciliation
-- Applied after the scraper started storing many image_url/product URLs without
-- consistently promoting image_verified/details_verified.
BEGIN;

-- Fill in missing canonical product page URLs. For each transceiver, take the
-- URL of its newest price observation from the last 180 days. Rows that
-- already have a non-empty product_page_url are left untouched.
WITH latest AS (
    SELECT DISTINCT ON (obs.transceiver_id)
        obs.transceiver_id,
        obs.url
    FROM price_observations AS obs
    WHERE obs.url <> ''                              -- a NULL url fails this test too
      AND obs.time > NOW() - INTERVAL '180 days'
    ORDER BY obs.transceiver_id, obs.time DESC       -- newest observation first
)
UPDATE transceivers AS tr
SET
    product_page_url = latest.url,
    updated_at       = NOW()
FROM latest
WHERE tr.id = latest.transceiver_id
  AND COALESCE(tr.product_page_url, '') = '';        -- only NULL or empty targets
-- Treat any scraper-written, non-placeholder image URL as an image
-- verification source (older scrapers often set only has_image/image_url).
-- An existing image_verified_at / non-empty image_verified_url is preserved;
-- otherwise NOW() / image_url are recorded.
UPDATE transceivers AS tr
SET
    has_image          = TRUE,
    image_verified     = TRUE,
    image_verified_at  = COALESCE(tr.image_verified_at, NOW()),
    image_verified_url = COALESCE(NULLIF(tr.image_verified_url, ''), tr.image_url),
    updated_at         = NOW()
WHERE tr.image_url <> ''                 -- a NULL image_url fails this test too
  -- Case-insensitive match on URL fragments that indicate a placeholder/logo asset.
  AND tr.image_url !~* '(placeholder|no-image|no_image|keinbild|logo)'
  AND tr.image_verified IS NOT TRUE;     -- FALSE or NULL
-- Mark details as verified when a product page URL and the core identity
-- fields (form factor, speed, part number, reach label, fiber type) are all
-- present. Rows with data_confidence = 'garbage' are skipped so malformed
-- scraper rows are not marked complete.
UPDATE transceivers AS tr
SET
    details_verified    = TRUE,
    details_verified_at = COALESCE(tr.details_verified_at, NOW()),
    details_source_url  = COALESCE(NULLIF(tr.details_source_url, ''), tr.product_page_url),
    -- A missing, 'unknown' or 'enriched_estimated' confidence becomes
    -- 'scraped_unverified'; any other value is kept as-is.
    data_confidence     = CASE
        WHEN COALESCE(tr.data_confidence, 'unknown') IN ('unknown', 'enriched_estimated')
            THEN 'scraped_unverified'
        ELSE tr.data_confidence
    END,
    updated_at          = NOW()
WHERE tr.product_page_url <> ''          -- `<> ''` also rejects NULL
  AND tr.form_factor IS NOT NULL
  AND tr.speed_gbps IS NOT NULL
  AND tr.part_number <> ''
  AND tr.reach_label <> ''
  AND tr.fiber_type <> ''
  AND tr.data_confidence IS DISTINCT FROM 'garbage'   -- NULL counts as "not garbage"
  AND tr.details_verified IS NOT TRUE;                -- FALSE or NULL
-- Refresh the full-verification badge after the promotions above: a row is
-- flagged fully_verified once all four component flags (competitor, price,
-- image, details) are TRUE. An existing fully_verified_at is preserved.
UPDATE transceivers
SET
    fully_verified    = TRUE,
    fully_verified_at = COALESCE(fully_verified_at, NOW()),
    updated_at        = NOW()
WHERE competitor_verified = TRUE
  AND price_verified = TRUE
  AND image_verified = TRUE
  AND details_verified = TRUE
  -- IS NOT TRUE matches FALSE and NULL. A plain `= false` comparison would
  -- silently skip rows whose flag was never initialised, unlike the
  -- image/details promotions earlier in this migration.
  AND fully_verified IS NOT TRUE;

COMMIT;
-- Post-migration summary: one row of coverage counters over all transceivers.
SELECT
    COUNT(*)                                          AS total,
    -- COUNT(expr) skips NULLs; NULLIF turns '' into NULL, so only non-empty URLs count.
    COUNT(NULLIF(image_url, ''))                      AS has_image_url,
    COUNT(*) FILTER (WHERE image_verified IS TRUE)    AS image_verified,
    COUNT(NULLIF(product_page_url, ''))               AS has_product_page_url,
    COUNT(*) FILTER (WHERE details_verified IS TRUE)  AS details_verified,
    COUNT(*) FILTER (WHERE fully_verified IS TRUE)    AS fully_verified
FROM transceivers;