Replaces the GraphQL/search-based Flexoptix scraper with a proper
Magento 2 REST API integration that delivers authoritative SKUs,
prices, stock levels and compatibility data.
New files:
- packages/scraper/src/robots/flexoptix-api-sync.ts
Self-contained robot: auth → paginated fetch → normalize → DB write.
Reads FLEXOPTIX_API_BASE_URL / _USERNAME / _PASSWORD from env.
Returns { fetched, normalized, skipped, priceWrites, stockWrites }.
No file intermediary — in-memory pipeline.
- scripts/import-flexoptix-catalog.ts
One-shot CLI importer for the Pulso-generated JSONL (Codex handover).
- docs/FLEXOPTIX_CATALOG_IMPORT.md
Runbook for manual import + per-SKU specifications enrichment.
Scheduler changes:
- Added sync:flexoptix-catalog queue + work() handler
- Scheduled every 2h at 0 */2 * * * (same cadence as legacy job)
- scrape:pricing:flexoptix kept as legacy GraphQL fallback
Also includes Codex-generated additions from this sprint:
- audiocodes-oem scraper, seed-batch35/36/37, db.ts improvements,
sql/102 verification reconcile, README + package.json updates
82 lines
3.0 KiB
PL/PgSQL
82 lines
3.0 KiB
PL/PgSQL
-- Migration 102: Product photo/details verification reconciliation
-- Applied after the scraper started storing many image_url/product URLs without
-- consistently promoting image_verified/details_verified.
|
|
|
|
BEGIN;

-- Fill in missing product page URLs from price observations.
-- For every transceiver whose product_page_url is NULL or empty, copy the URL
-- of its newest price observation that has a non-empty URL and was recorded
-- within the last 180 days. Rows that already carry a URL are left untouched.
WITH latest_observed_url AS (
    SELECT DISTINCT ON (obs.transceiver_id)
        obs.transceiver_id,
        obs.url
    FROM price_observations AS obs
    WHERE obs.url <> ''  -- also rejects NULL: NULL <> '' never evaluates to true
      AND obs.time > NOW() - INTERVAL '180 days'
    ORDER BY obs.transceiver_id, obs.time DESC  -- newest observation per transceiver wins
)
UPDATE transceivers AS tx
SET updated_at = NOW(),
    product_page_url = latest_observed_url.url
FROM latest_observed_url
WHERE tx.id = latest_observed_url.transceiver_id
  AND COALESCE(tx.product_page_url, '') = '';  -- only rows with no URL yet
|
|
|
|
-- Promote image verification for rows that already hold a usable image URL.
-- A non-empty image_url that does not look like a placeholder/logo asset is
-- treated as the verification source, because older scrapers often filled in
-- only has_image/image_url. An existing image_verified_at timestamp and a
-- non-empty image_verified_url are preserved; otherwise NOW() / image_url
-- are used.
UPDATE transceivers
SET updated_at = NOW(),
    has_image = true,
    image_verified = true,
    image_verified_url = COALESCE(NULLIF(image_verified_url, ''), image_url),
    image_verified_at = COALESCE(image_verified_at, NOW())
WHERE image_verified IS NOT TRUE  -- false or NULL
  AND image_url <> ''             -- non-NULL and non-empty
  AND image_url !~* '(placeholder|no-image|no_image|keinbild|logo)';
|
|
|
|
-- Promote details verification for rows with a source URL and a complete
-- core identity: form_factor, speed_gbps, part_number, reach_label and
-- fiber_type must all be present, and the row must not be flagged 'garbage'.
-- This keeps malformed scraper rows from being marked as complete.
-- data_confidence is lifted to 'scraped_unverified' only when it is NULL,
-- 'unknown' or 'enriched_estimated'; any other value is kept as-is.
UPDATE transceivers
SET updated_at = NOW(),
    details_verified = true,
    details_verified_at = COALESCE(details_verified_at, NOW()),
    details_source_url = COALESCE(NULLIF(details_source_url, ''), product_page_url),
    data_confidence = CASE
        WHEN COALESCE(data_confidence, 'unknown') IN ('unknown', 'enriched_estimated')
            THEN 'scraped_unverified'
        ELSE data_confidence
    END
WHERE details_verified IS NOT TRUE                 -- false or NULL
  AND data_confidence IS DISTINCT FROM 'garbage'   -- NULL counts as not garbage
  AND product_page_url <> ''                       -- "<> ''" also rejects NULL
  AND part_number <> ''
  AND reach_label <> ''
  AND fiber_type <> ''
  AND form_factor IS NOT NULL
  AND speed_gbps IS NOT NULL;
|
|
|
|
-- Refresh the full badge after the promotions above.
-- A row becomes fully verified once competitor, price, image and details
-- verification are all true. An existing fully_verified_at is preserved.
--
-- The guard uses IS NOT TRUE rather than "= false" so that rows whose
-- fully_verified is NULL are promoted as well: "NULL = false" evaluates to
-- NULL and would silently skip them. This matches the image/details
-- promotions in this migration, which also treat NULL as "not verified".
UPDATE transceivers
SET fully_verified = true,
    fully_verified_at = COALESCE(fully_verified_at, NOW()),
    updated_at = NOW()
WHERE competitor_verified = true
  AND price_verified = true
  AND image_verified = true
  AND details_verified = true
  AND fully_verified IS NOT TRUE;

COMMIT;
|
|
|
|
-- Post-migration report: a single row of coverage counters over transceivers
-- (total rows, rows with a non-empty image/product URL, and rows carrying
-- each verification flag).
SELECT
    COUNT(*)                                            AS total,
    COUNT(NULLIF(image_url, ''))                        AS has_image_url,         -- non-NULL, non-empty
    COUNT(*) FILTER (WHERE image_verified IS TRUE)      AS image_verified,
    COUNT(NULLIF(product_page_url, ''))                 AS has_product_page_url,  -- non-NULL, non-empty
    COUNT(*) FILTER (WHERE details_verified IS TRUE)    AS details_verified,
    COUNT(*) FILTER (WHERE fully_verified IS TRUE)      AS fully_verified
FROM transceivers;
|