Files
homeproz/contract/Contracts/WordPress-Website/_scratch/agent_scrape/scrape-agents.js
T
Hanson.xyz Dev acc8ac87a0 wip
2026-01-04 17:50:08 -06:00

146 lines
4.8 KiB
JavaScript
Executable File

/**
* Agent Image Scraper for HomeProz
*/
const fs = require('fs');
const http = require('http');
const https = require('https');
const path = require('path');
const { chromium } = require('playwright');
// Directory (an `images` folder next to this script) used as the target for downloaded files.
const OUTPUT_DIR = path.join(__dirname, 'images');
/**
 * Download a resource over HTTP(S) and write it to disk, following redirects.
 *
 * The destination file is only created once a 200 response has been received,
 * so failed requests (bad URL, network error, non-200 status) never leave an
 * empty file behind. If the transfer fails midway, the partial file is removed.
 *
 * @param {string} url - Absolute `http:` or `https:` URL to fetch.
 * @param {string} filepath - Destination path; its directory must already exist.
 * @param {number} [maxRedirects=5] - How many redirects may still be followed.
 * @returns {Promise<string>} Resolves with `filepath` once the file is fully written and closed.
 * @throws {Error} (as a rejection) on an invalid URL, unsupported protocol,
 *   too many redirects, a non-200 final status, or any network/file-system error.
 */
async function downloadImage(url, filepath, maxRedirects = 5) {
  return new Promise((resolve, reject) => {
    let target;
    try {
      target = new URL(url);
    } catch (err) {
      reject(new Error(`Invalid URL: ${url}`, { cause: err }));
      return;
    }

    // Pick the transport matching the scheme; https.get() rejects http: URLs.
    let client = null;
    if (target.protocol === 'https:') {
      client = https;
    } else if (target.protocol === 'http:') {
      client = http;
    }
    if (client === null) {
      reject(new Error(`Unsupported protocol: ${target.protocol}`));
      return;
    }

    const request = client.get(target, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
      }
    }, (response) => {
      const { statusCode, headers } = response;

      if ([301, 302, 303, 307, 308].includes(statusCode) && headers.location) {
        // Drain the redirect body so the socket is released.
        response.resume();
        if (maxRedirects <= 0) {
          reject(new Error('Too many redirects'));
          return;
        }
        // Location may be relative; resolve it against the current URL.
        const nextUrl = new URL(headers.location, target).href;
        downloadImage(nextUrl, filepath, maxRedirects - 1).then(resolve, reject);
        return;
      }

      if (statusCode !== 200) {
        response.resume();
        reject(new Error(`Failed to download: ${statusCode}`));
        return;
      }

      // Open the destination only now that there is a body worth saving.
      const file = fs.createWriteStream(filepath);
      const fail = (err) => {
        file.destroy();
        // Best-effort cleanup of the partial file; the original error is what matters.
        fs.unlink(filepath, () => reject(err));
      };
      file.on('error', fail);
      response.on('error', fail);
      file.on('finish', () => {
        // Resolve only after the descriptor is actually closed.
        file.close(() => resolve(filepath));
      });
      response.pipe(file);
    });

    request.on('error', reject);
  });
}
/**
 * Scrape the HomeProz homepage for candidate agent photos.
 *
 * Steps:
 *   1. Launch headless Chromium and load https://homeprozrealestate.com/.
 *   2. Save a full-page screenshot as `homepage.png` next to this script.
 *   3. Collect two lists of <img> elements from the page: ones whose alt/src
 *      match agent-related keywords, and ones rendered larger than 100x100
 *      whose src does not contain "logo" or "icon".
 *   4. Write both lists to `found-images.json` next to this script.
 *   5. Download every "large" image with an http(s) URL into OUTPUT_DIR as
 *      `image-<n>.png|jpg`; individual download failures are logged and skipped.
 *
 * The browser is always closed, even when a step throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('Starting agent image scraper...\n');

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });

  // try/finally so a navigation or I/O failure does not leave Chromium running.
  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      viewport: { width: 1920, height: 1080 }
    });
    const page = await context.newPage();

    console.log('Loading homeprozrealestate.com...');
    await page.goto('https://homeprozrealestate.com/', { waitUntil: 'networkidle', timeout: 60000 });
    // Extra settle time after network idle before capturing the page.
    await page.waitForTimeout(3000);

    // Full-page screenshot of the homepage for manual inspection.
    await page.screenshot({ path: path.join(__dirname, 'homepage.png'), fullPage: true });
    console.log('Screenshot saved\n');

    // Runs inside the page: everything in this callback must be self-contained.
    const agentData = await page.evaluate(() => {
      const altKeywords = ['anna', 'davy', 'jordan', 'lily', 'agent', 'realtor'];
      const srcKeywords = ['agent', 'team'];
      const allImages = Array.from(document.querySelectorAll('img'));

      // Images whose alt text or URL hints at an agent photo.
      const agents = [];
      for (const img of allImages) {
        // Fall back to data-src when src is empty.
        const src = img.src || img.dataset.src;
        if (!src) {
          continue;
        }
        const alt = img.alt || '';
        const altLower = alt.toLowerCase();
        const looksLikeAgent =
          altKeywords.some((word) => altLower.includes(word)) ||
          srcKeywords.some((word) => src.includes(word));
        if (looksLikeAgent) {
          agents.push({
            src,
            alt,
            width: img.naturalWidth || img.width,
            height: img.naturalHeight || img.height
          });
        }
      }

      // Images rendered at a decent size (likely profile photos), minus logos/icons.
      const largeImages = [];
      for (const img of allImages) {
        const src = img.src || img.dataset.src;
        if (!src || src.includes('logo') || src.includes('icon')) {
          continue;
        }
        const rect = img.getBoundingClientRect();
        if (rect.width > 100 && rect.height > 100) {
          largeImages.push({
            src,
            alt: img.alt || '',
            width: rect.width,
            height: rect.height
          });
        }
      }

      return { agents, largeImages };
    });

    console.log('Agent-related images found:', agentData.agents.length);
    console.log('Large images found:', agentData.largeImages.length);

    // Save all found image URLs
    fs.writeFileSync(
      path.join(__dirname, 'found-images.json'),
      JSON.stringify(agentData, null, 2)
    );

    // The output directory is not guaranteed to exist on a fresh checkout.
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });

    // Download large images one at a time; a failed download does not stop the run.
    let imgCount = 0;
    for (const img of agentData.largeImages) {
      if (!img.src || !img.src.startsWith('http')) {
        continue;
      }
      imgCount++;
      const ext = img.src.includes('.png') ? 'png' : 'jpg';
      const filename = `image-${imgCount}.${ext}`;
      const filepath = path.join(OUTPUT_DIR, filename);
      try {
        await downloadImage(img.src, filepath);
        console.log(`Downloaded: ${filename} (${img.alt || 'no alt'})`);
      } catch (err) {
        console.log(`Failed to download ${img.src}: ${err.message}`);
      }
    }
  } finally {
    await browser.close();
  }

  console.log('\nDone!');
}
// Entry point: report any failure and signal it to the caller via a non-zero exit status.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});