- Register scrape:pricing:naddod (48 */2), qsfptek (52 */2), addon (55 */2) in pg-boss
- Add boss.work() handlers for all three (fetch-based, run on Erik)
- Fix findOrCreateScrapedTransceiver callers: remove invalid `name`/`url` params, fix `t.id` → `t` (function already returns string ID)
- Fix ebay-enricher: remove invalid `extractType` option, use extraction.standard_name instead of non-existent `.description`, fix cheerio type incompatibility
- Fix community-issues: description → summary, publishedDate → published_at
- Startup zombie cleanup already deployed (index.ts) — no changes needed
- ProLabs rewritten to fetch-based catalog scraper (no Playwright, bypasses WAF)
451 lines
17 KiB
TypeScript
451 lines
17 KiB
TypeScript
/**
 * eBay Product Enricher
 *
 * Searches eBay for switch/transceiver models to extract:
 * - Product description & features
 * - Refurbished/used prices
 * - Product images
 * - Technical specs from listing descriptions
 *
 * Uses CheerioCrawler + Crawler LLM for structured extraction.
 */
|
||
|
||
import { CheerioCrawler, RequestQueue } from "crawlee";
|
||
import { scrapeWithLLM } from "../crawler-llm/core";
|
||
import { db } from "../utils/db";
|
||
import { logger } from "../utils/logger";
|
||
|
||
/** A single eBay listing collected while enriching a switch or transceiver model. */
interface EbayListing {
  /** Listing title as shown on the search results page. */
  title: string;
  /** Numeric listing price; 0 when the price text could not be parsed. */
  price: number;
  /** Currency code of `price` (this module only produces "EUR"). */
  currency: string;
  /** Normalised item condition derived from eBay's condition label. */
  condition: "new" | "refurbished" | "used";
  /** Seller name, or "unknown" when it is not found on the listing page. */
  seller: string;
  /** Warranty length in months parsed from the title, or null if none is mentioned. */
  warrantyMonths: number | null;
  /** Image URL taken from the listing page or the search thumbnail, if any. */
  imageUrl: string | null;
  /** URL of the listing page that was crawled. */
  listingUrl: string;
  /** Digits following `/itm/` in the listing URL, or "" when absent. */
  itemId: string;
  /** Short descriptive text for the listing; may be empty. */
  description: string;
  /** Item specifics rendered as "label: value" strings. */
  features: string[];
  /** Structured key/value specs for the listing. */
  specs: Record<string, string>;
}
|
||
|
||
/** Aggregated outcome of enriching one model from its eBay listings. */
interface EnrichResult {
  /** Model string that was searched for. */
  model: string;
  /** Every listing that was successfully crawled. */
  listings: EbayListing[];
  /** Lowest positive price among refurbished or used listings, or null if none. */
  bestRefurbPrice: number | null;
  /** Lowest positive price among new listings, or null if none. */
  bestNewPrice: number | null;
  /** Item specifics ("label: value") taken from the first listing that had any. */
  features: string[];
  /** First sufficiently long listing description; "" when none qualified. */
  description: string;
  /** First listing image that was found, or null. */
  imageUrl: string | null;
}
|
||
|
||
/**
 * Builds an ebay.de search URL (EUR pricing, covers the DE/EU market).
 *
 * The fixed query parameters (`_sop`, `LH_ItemCondition`, `_ipg`, ...) are
 * eBay-defined sort / condition / page-size switches.
 * NOTE(review): presumably "cheapest first" with 50 results per page,
 * restricted to new and used items — confirm against eBay's URL parameters.
 *
 * @param query Free-text search term; it is URL-encoded here.
 * @param page  1-based result page. Anything that is not a positive integer
 *              (NaN, 0, negatives, fractions) falls back to page 1.
 * @returns The absolute search URL.
 */
function buildSearchUrl(query: string, page = 1): string {
  const encodedQuery = encodeURIComponent(query);
  const pageNumber = Number.isInteger(page) && page > 0 ? page : 1;
  return `https://www.ebay.de/sch/i.html?_nkw=${encodedQuery}&_sop=15&LH_ItemCondition=3000%7C1500%7C1000&_ipg=50&_pgn=${pageNumber}&_stpos=0&_from=R40`;
}
|
||
|
||
/**
 * Maps an eBay condition label (German or English) to our condition type.
 *
 * Refurbished markers are tested before the "neu"/"new" markers so that a
 * label mentioning both (e.g. "Neuwertig – generalüberholt") is not reported
 * as new. Anything without a recognised marker, including "", is "used".
 *
 * @param condStr Raw condition text from the listing.
 * @returns The normalised condition.
 */
function parseCondition(condStr: string): "new" | "refurbished" | "used" {
  const label = condStr.toLowerCase();

  const refurbishedMarkers = ["refurb", "überholt", "generalüber"];
  if (refurbishedMarkers.some(marker => label.includes(marker))) {
    return "refurbished";
  }

  const newMarkers = ["neu", "new"];
  if (newMarkers.some(marker => label.includes(marker))) {
    return "new";
  }

  return "used";
}
|
||
|
||
/**
 * Warranty phrases recognised in listing text, tried in order (month-based
 * phrases first). Singular and plural units are accepted in English and
 * German; German matches both "Gewähr(leistung)" and "Garantie".
 */
const WARRANTY_RULES: ReadonlyArray<{ pattern: RegExp; monthsPerUnit: number }> = [
  { pattern: /(\d+)\s*[-–]?\s*months?\s*warrant/i, monthsPerUnit: 1 },
  { pattern: /(\d+)\s*[-–]?\s*monat(?:e|en)?\s*(?:gewähr|garantie)/i, monthsPerUnit: 1 },
  { pattern: /(\d+)\s*[-–]?\s*years?\s*warrant/i, monthsPerUnit: 12 },
  { pattern: /(\d+)\s*[-–]?\s*jahr(?:e|en)?\s*(?:gewähr|garantie)/i, monthsPerUnit: 12 },
];

/**
 * Extracts a warranty duration from listing title/description text.
 *
 * @param text Free text such as "Cisco SFP 12 months warranty".
 * @returns The warranty length in months (years are multiplied by 12), or
 *          null when no warranty phrase is found.
 */
function extractWarranty(text: string): number | null {
  for (const { pattern, monthsPerUnit } of WARRANTY_RULES) {
    const digits = pattern.exec(text)?.[1];
    if (digits) {
      return Number.parseInt(digits, 10) * monthsPerUnit;
    }
  }
  return null;
}
|
||
|
||
/** One raw row of an eBay search results page; price and condition are unparsed display text. */
interface EbaySearchItem {
  title: string;
  url: string;
  price: string;
  condition: string;
  imageUrl: string;
}

/**
 * Turns a listing href into an absolute URL.
 * Absolute http(s) hrefs are returned untouched; relative or protocol-relative
 * ones are resolved against `baseUrl`. If resolution fails the href is
 * returned as-is.
 */
function resolveListingUrl(href: string, baseUrl: string): string {
  if (!href || /^https?:\/\//i.test(href)) return href;
  try {
    return new URL(href, baseUrl).toString();
  } catch {
    return href;
  }
}

/**
 * Extracts the result rows from an eBay search results page.
 *
 * Rows without a title or link are skipped, as is eBay's "Shop on eBay"
 * placeholder card. The thumbnail falls back from `src` to `data-src`.
 *
 * @param $       Cheerio-style selector function for the loaded page. Typed
 *                `any` because of a cheerio type incompatibility with crawlee.
 * @param baseUrl URL of the search page, used to absolutise relative hrefs.
 * @returns The rows in page order.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
async function parseSearchResults($: any, baseUrl: string): Promise<EbaySearchItem[]> {
  const rows: EbaySearchItem[] = [];

  $(".s-item").each((_index: number, el: unknown) => {
    const card = $(el);

    const title = card.find(".s-item__title").text().trim();
    const price = card.find(".s-item__price").text().trim();
    const condition = card.find(".SECONDARY_INFO").text().trim();
    const url = resolveListingUrl(card.find(".s-item__link").attr("href") || "", baseUrl);

    const thumbnail = card.find(".s-item__image-img");
    const imageUrl = thumbnail.attr("src") || thumbnail.attr("data-src") || "";

    const isPlaceholder = title.toLowerCase().includes("shop on ebay");
    if (title && url && !isPlaceholder) {
      rows.push({ title, url, price, condition, imageUrl });
    }
  });

  return rows;
}
|
||
|
||
/**
 * Crawls ebay.de for one switch model and aggregates what the listings reveal.
 *
 * Two request phases share one queue:
 *  - "search":  the results page; up to three hits whose title contains the
 *               part of the model name before the first hyphen are enqueued.
 *  - "listing": one listing page; price, condition, warranty, image, item
 *               specifics and a description are collected into the result.
 *
 * @param switchId Switch id; its first 8 characters name the request queue.
 * @param model    Model string used as the eBay search term.
 * @returns The aggregated result, or null when no listing could be crawled.
 *          Crawler failures are logged, not thrown.
 */
async function enrichSwitchFromEbay(switchId: string, model: string): Promise<EnrichResult | null> {
  const result: EnrichResult = {
    model,
    listings: [],
    bestRefurbPrice: null,
    bestNewPrice: null,
    features: [],
    description: "",
    imageUrl: null,
  };

  const modelPrefix = model.toLowerCase().split("-")[0] ?? "";

  const queue = await RequestQueue.open(`ebay-${switchId.substring(0, 8)}`);

  try {
    await queue.addRequest({ url: buildSearchUrl(model), userData: { model, phase: "search" } });

    const crawler = new CheerioCrawler({
      requestQueue: queue,
      maxRequestsPerCrawl: 5,
      requestHandlerTimeoutSecs: 30,

      async requestHandler({ request, $, crawler: activeCrawler }) {
        const { phase } = request.userData as { phase: string };

        if (phase === "search") {
          const hits = await parseSearchResults($, request.url);

          // Take up to 3 most relevant listings, then keep only crawlable URLs.
          const listingRequests = hits
            .filter(hit => hit.title.toLowerCase().includes(modelPrefix))
            .slice(0, 3)
            .filter(hit => hit.url.startsWith("http"))
            .map(hit => ({
              // Drop the tracking query string so the URL is stable.
              url: hit.url.split("?")[0] ?? hit.url,
              userData: {
                phase: "listing",
                model,
                priceStr: hit.price,
                conditionStr: hit.condition,
                imageUrl: hit.imageUrl,
                title: hit.title,
              },
            }));

          if (listingRequests.length > 0) {
            await activeCrawler.addRequests(listingRequests);
          }
          return;
        }

        if (phase !== "listing") return;

        const { title, priceStr, conditionStr, imageUrl: searchImageUrl } = request.userData as {
          title: string; priceStr: string; conditionStr: string; imageUrl: string;
        };

        // LLM extraction is best-effort: a failure must not discard the price
        // and specifics that are scraped directly from the page below.
        let llmName = "";
        try {
          const extracted = await scrapeWithLLM($.html(), request.url, {
            vendorSlug: "ebay",
          });
          llmName = extracted?.extraction.standard_name || "";
        } catch (err) {
          logger.warn(`eBay enricher: LLM extraction failed for ${request.url}: ${err}`);
        }

        // EUR format "1.234,56 EUR": take the first amount only (price ranges
        // list two), strip every thousands dot, then use "." as decimal mark.
        const amountText = priceStr.match(/\d[\d.]*(?:,\d+)?/)?.[0] ?? "";
        const price = parseFloat(amountText.replace(/\./g, "").replace(",", ".")) || 0;

        const condition = parseCondition(conditionStr);
        const warranty = extractWarranty(title);

        // Prefer the listing page image (higher quality than search thumbnail).
        const listingImage = $(".ux-image-carousel-item img").first().attr("src")
          || $(".img img").first().attr("src")
          || searchImageUrl;

        // Item specifics table; "Siehe Anzeige" ("see listing") carries no data.
        const features: string[] = [];
        const specs: Record<string, string> = {};
        $(".ux-labels-values").each((_, el) => {
          const label = $(el).find(".ux-labels-values__labels").text().trim();
          const value = $(el).find(".ux-labels-values__values").text().trim();
          if (label && value && value !== "Siehe Anzeige") {
            features.push(`${label}: ${value}`);
            specs[label] = value;
          }
        });

        const description = llmName
          || $(".ux-textspans--BOLD").first().text().trim()
          || "";

        result.listings.push({
          title,
          price,
          currency: "EUR",
          condition,
          seller: $(".ux-seller-section__item--seller a").text().trim() || "unknown",
          warrantyMonths: warranty,
          imageUrl: listingImage || null,
          listingUrl: request.url,
          itemId: request.url.match(/\/itm\/(\d+)/)?.[1] || "",
          description,
          features,
          specs,
        });

        // Track best prices; used listings count towards the refurb price.
        if (price > 0) {
          if (condition === "new") {
            if (!result.bestNewPrice || price < result.bestNewPrice) {
              result.bestNewPrice = price;
            }
          } else if (!result.bestRefurbPrice || price < result.bestRefurbPrice) {
            result.bestRefurbPrice = price;
          }
        }

        // First listing with specifics supplies the switch features (max 10).
        if (features.length > 0 && result.features.length === 0) {
          result.features = features.slice(0, 10);
        }

        if (!result.imageUrl && listingImage) {
          result.imageUrl = listingImage;
        }

        // First description longer than 50 characters wins, capped at 500.
        if (!result.description && description.length > 50) {
          result.description = description.substring(0, 500);
        }
      },

      failedRequestHandler: ({ request, error }) => {
        logger.warn(`eBay enricher failed for ${request.url}: ${error}`);
      },
    });

    try {
      await crawler.run();
    } catch (err) {
      logger.error("eBay crawler run error", { err, model });
    }
  } finally {
    // Named queues are persisted, including which requests were handled.
    // Drop it so storage does not grow and a later run for the same switch
    // is not skipped as "already handled".
    await queue.drop().catch((err: unknown) => {
      logger.warn(`eBay enricher: failed to drop request queue: ${err}`);
    });
  }

  return result.listings.length > 0 ? result : null;
}
|
||
|
||
// ─────────────────────────────────────────────────────────────────────────────
// Save enrichment results to DB
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Persists an eBay enrichment result onto the `switches` row.
 *
 * Always stamps `ebay_enriched_at`. Features, description, refurb price and
 * image are written only when the result has them; description and image
 * never overwrite an existing value. Also upserts the eBay vendor row.
 *
 * @param switchId Id of the switch row to update.
 * @param result   Aggregated enrichment data for that switch.
 */
async function saveEnrichment(switchId: string, result: EnrichResult): Promise<void> {
  const { db: pool } = await import("../utils/db");

  const assignments: string[] = ["ebay_enriched_at = NOW()"];
  const params: unknown[] = [];
  // Registers a value and returns its positional placeholder ($1, $2, ...).
  const bind = (value: unknown): string => {
    params.push(value);
    return `$${params.length}`;
  };

  if (result.features.length > 0) {
    assignments.push(`features = ${bind(JSON.stringify(result.features))}::jsonb`);
  }

  if (result.description) {
    assignments.push(`description = COALESCE(description, ${bind(result.description)})`);
  }

  if (result.bestRefurbPrice) {
    // NOTE(review): listings are priced in EUR (ebay.de) but the column is
    // named *_usd — confirm whether a currency conversion is expected here.
    assignments.push(`ebay_refurb_price_usd = ${bind(result.bestRefurbPrice)}`);
  }

  if (result.imageUrl && result.imageUrl.startsWith("http")) {
    // Only set image_url if not already set
    assignments.push(`image_url = COALESCE(NULLIF(image_url, ''), ${bind(result.imageUrl)})`);
  }

  const idPlaceholder = bind(switchId);
  await pool.query(
    `UPDATE switches SET ${assignments.join(", ")} WHERE id = ${idPlaceholder}`,
    params
  );

  // Find eBay vendor ID (create if needed)
  const ebayVendorResult = await pool.query(
    `INSERT INTO vendors (name, slug, type, website_url)
     VALUES ('eBay Marketplace', 'ebay', 'marketplace', 'https://www.ebay.de')
     ON CONFLICT (slug) DO UPDATE SET name = EXCLUDED.name
     RETURNING id`
  );
  const ebayVendorId = ebayVendorResult.rows[0]?.id;
  if (!ebayVendorId) {
    logger.warn("eBay enrichment: vendor upsert returned no id", { model: result.model });
  }

  // Per-listing price observations are not written for switches:
  // price_observations requires a transceiver_id, so switches will need a
  // different approach later. For now, just log the refurb price data.
  logger.info("eBay enrichment saved", {
    model: result.model,
    listingsCount: result.listings.length,
    bestRefurb: result.bestRefurbPrice,
    featuresCount: result.features.length,
    hasImage: !!result.imageUrl,
  });
}
|
||
|
||
// ─────────────────────────────────────────────────────────────────────────────
// Main: enrich switches that haven't been enriched yet
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Enriches switches that have no `ebay_enriched_at` timestamp yet.
 *
 * Only switches with `max_speed_gbps >= 10` are selected, fastest first.
 * Switches are processed one at a time with a random 3–5 s pause between
 * them. A switch with no listings is still stamped so it is not retried; a
 * switch whose enrichment throws is logged and left unstamped.
 *
 * @param limit Maximum number of switches to process; values that are not a
 *              positive integer fall back to 20.
 */
export async function enrichSwitchesFromEbay(limit = 20): Promise<void> {
  const { db: pool } = await import("../utils/db");

  const batchSize = Number.isInteger(limit) && limit > 0 ? limit : 20;

  const switches = await pool.query<{ id: string; model: string; vendor_name: string }>(
    `SELECT sw.id, sw.model, v.name AS vendor_name
     FROM switches sw
     JOIN vendors v ON sw.vendor_id = v.id
     WHERE sw.ebay_enriched_at IS NULL
       AND sw.max_speed_gbps >= 10
     ORDER BY sw.max_speed_gbps DESC, sw.created_at ASC
     LIMIT $1`,
    [batchSize]
  );

  const pending = switches.rows;
  logger.info(`eBay enricher: processing ${pending.length} switches`);

  for (const [position, sw] of pending.entries()) {
    logger.info(`Enriching ${sw.model} from eBay...`);

    try {
      const result = await enrichSwitchFromEbay(sw.id, sw.model);
      if (result) {
        await saveEnrichment(sw.id, result);
        const refurb = result.bestRefurbPrice !== null ? `€${result.bestRefurbPrice}` : "n/a";
        logger.info(`✓ ${sw.model}: ${result.listings.length} listings, refurb ${refurb}`);
      } else {
        // Mark as tried even if no results
        await pool.query("UPDATE switches SET ebay_enriched_at = NOW() WHERE id = $1", [sw.id]);
        logger.info(`○ ${sw.model}: no eBay listings found`);
      }
    } catch (err) {
      logger.error(`✗ ${sw.model}: enrichment failed`, { err });
    }

    // Rate limiting — be polite to eBay. No pause is needed after the last one.
    if (position < pending.length - 1) {
      await new Promise(resolve => setTimeout(resolve, 3000 + Math.random() * 2000));
    }
  }
}
|
||
|
||
// ─────────────────────────────────────────────────────────────────────────────
// Enrich transceivers from eBay (price observations with condition)
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Records eBay price observations for transceivers that have none from the
 * "ebay" marketplace in the last 30 days.
 *
 * Each transceiver gets one eBay search (by part number). From the results,
 * the first refurbished hit and the first new hit are each stored as a row in
 * `price_observations`. Crawl failures are logged, not thrown.
 *
 * @param limit Maximum number of transceivers to process; values that are
 *              not a positive integer fall back to 50.
 */
export async function enrichTransceiversFromEbay(limit = 50): Promise<void> {
  const { db: pool } = await import("../utils/db");

  const batchSize = Number.isInteger(limit) && limit > 0 ? limit : 50;

  // Find eBay vendor
  const ebayVendor = await pool.query(
    `INSERT INTO vendors (name, slug, type, website_url)
     VALUES ('eBay Marketplace', 'ebay', 'marketplace', 'https://www.ebay.de')
     ON CONFLICT (slug) DO UPDATE SET updated_at = NOW()
     RETURNING id`
  );
  const ebayVendorId = ebayVendor.rows[0]?.id;
  if (!ebayVendorId) {
    // Without a vendor id every observation would be written with a NULL source.
    logger.error("eBay transceiver enricher: could not resolve eBay vendor id");
    return;
  }

  // Get transceivers without eBay price observations in last 30 days
  const transceivers = await pool.query<{ id: string; slug: string; part_number: string; form_factor: string; speed_gbps: number }>(
    `SELECT t.id, t.slug, t.part_number, t.form_factor, t.speed_gbps
     FROM transceivers t
     WHERE NOT EXISTS (
       SELECT 1 FROM price_observations po
       WHERE po.transceiver_id = t.id
         AND po.marketplace = 'ebay'
         AND po.time > NOW() - INTERVAL '30 days'
     )
       AND t.part_number IS NOT NULL
     ORDER BY t.speed_gbps DESC
     LIMIT $1`,
    [batchSize]
  );

  logger.info(`eBay transceiver enricher: processing ${transceivers.rows.length} transceivers`);

  /** Stores one search hit as a price observation; hits without a positive price are skipped. */
  const insertObs = async (
    transceiverId: string,
    item: { price: string; condition: string; imageUrl: string; title: string; url: string },
    condition: "new" | "refurbished",
  ): Promise<void> => {
    // EUR format "1.234,56 EUR": take the first amount only (price ranges
    // list two), strip every thousands dot, then use "." as decimal mark.
    const amountText = item.price.match(/\d[\d.]*(?:,\d+)?/)?.[0] ?? "";
    const price = parseFloat(amountText.replace(/\./g, "").replace(",", "."));
    if (!price || price <= 0) return;

    const warranty = extractWarranty(item.title);

    await pool.query(
      `INSERT INTO price_observations
         (time, transceiver_id, source_vendor_id, price, currency, condition, marketplace, warranty_months, seller_name, listing_title, url, scrape_method, stock_level)
       VALUES (NOW(), $1, $2, $3, 'EUR', $4, 'ebay', $5, $6, $7, $8, 'crawlee', 'in_stock')
       ON CONFLICT DO NOTHING`,
      [transceiverId, ebayVendorId, price, condition, warranty, "eBay Seller", item.title.substring(0, 200), item.url]
    );
  };

  const queue = await RequestQueue.open("ebay-transceivers");

  try {
    for (const tcvr of transceivers.rows) {
      const query = tcvr.part_number || `${tcvr.form_factor} ${tcvr.speed_gbps}G transceiver`;
      await queue.addRequest({
        url: buildSearchUrl(query),
        // Keyed per transceiver so two transceivers whose searches produce the
        // same URL are not collapsed into a single request.
        uniqueKey: `ebay-transceiver-${tcvr.id}`,
        userData: { transceiverId: tcvr.id, query, formFactor: tcvr.form_factor, speedGbps: tcvr.speed_gbps },
      });
    }

    const crawler = new CheerioCrawler({
      requestQueue: queue,
      maxRequestsPerCrawl: batchSize,
      requestHandlerTimeoutSecs: 20,
      maxConcurrency: 2,

      async requestHandler({ request, $ }) {
        const { transceiverId } = request.userData as { transceiverId: string };

        const hits = await parseSearchResults($, request.url);

        // The first hit of each condition is taken as the best price.
        // NOTE(review): relies on the search URL sorting cheapest-first — confirm.
        const bestRefurb = hits.find(hit => parseCondition(hit.condition) === "refurbished");
        const bestNew = hits.find(hit => parseCondition(hit.condition) === "new");

        if (bestRefurb) await insertObs(transceiverId, bestRefurb, "refurbished");
        if (bestNew) await insertObs(transceiverId, bestNew, "new");
      },

      failedRequestHandler: ({ request, error }) => {
        logger.warn(`eBay transceiver enricher failed for ${request.url}: ${error}`);
      },
    });

    try {
      await crawler.run();
    } catch (err) {
      logger.error("eBay transceiver crawler error", { err });
    }
  } finally {
    // Named queues are persisted, including which requests were handled.
    // Drop it so the same transceivers can be searched again on a later run.
    await queue.drop().catch((err: unknown) => {
      logger.warn(`eBay transceiver enricher: failed to drop request queue: ${err}`);
    });
  }
}
|
||
|
||
// CLI entrypoint: `<script> [switches|<anything else>] [limit]`
// "switches" (the default) enriches switches; any other target enriches
// transceivers. A missing or non-positive limit falls back to 20 / 50.
if (require.main === module) {
  void (async () => {
    const target = process.argv[2] || "switches";
    const requestedLimit = Number.parseInt(process.argv[3] ?? "", 10);
    // NaN fails the comparison, so unparseable input also uses the fallback.
    const limitOr = (fallback: number): number => (requestedLimit > 0 ? requestedLimit : fallback);

    try {
      if (target === "switches") {
        await enrichSwitchesFromEbay(limitOr(20));
      } else {
        await enrichTransceiversFromEbay(limitOr(50));
      }
      process.exit(0);
    } catch (err) {
      // Without this the rejection is unhandled and the failure reason is lost.
      logger.error("eBay enricher CLI failed", { err });
      process.exit(1);
    }
  })();
}
|