Files
homeproz/contract/Contracts/WordPress-Website/_scratch/property_scrape/scrape-properties.js
T
Hanson.xyz Dev 775c57a678 Property filters overhaul: status sorting, simplified UI
- Remove status dropdown (always show all properties)
- Remove sort dropdown (use status-based sorting)
- Sort order: Active > Pending > Sold, then by modified date
- Map view: half height, 2-column property grid
- Beds field same width as others
- Add CLAUDE.md documentation for property system

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-30 19:52:50 -06:00

375 lines
13 KiB
JavaScript

/**
* Property Scraper for HomeProz Real Estate
*
* Scrapes property listings from homefinder.com and zillow.com
* Extracts: name, address, details, description, and images
*/
const { chromium } = require('playwright');
const fs = require('fs');
const path = require('path');
const https = require('https');
const http = require('http');
// Listings handled by the scraper, in processing order.
// Every entry carries:
//   id     - slug used as the output folder name
//   name   - human-readable label used in log output
//   url    - listing page to load
//   source - which site-specific scraper to use ('zillow' or 'homefinder')
//   status - optional listing status; entries without it are treated as active by main()
const PROPERTIES = [
  // --- No explicit status ---
  {
    id: 'geneva-mn-lot',
    name: 'Geneva, MN - Double Lot with Shop',
    url: 'https://homefinder.com/realestateandhomes-detail/Geneva_MN_56035_M93901-25044?from=srp-list-card',
    source: 'homefinder',
  },
  {
    id: '115-newton-n-ave',
    name: '115 Newton N Ave Albert Lea, MN',
    url: 'https://www.zillow.com/homedetails/115-N-Newton-Ave-Albert-Lea-MN-56007/458158566_zpid/',
    source: 'zillow',
  },
  {
    id: '411-court-street',
    name: '411 Court Street Albert Lea, MN',
    url: 'https://www.zillow.com/homedetails/411-Court-St-Albert-Lea-MN-56007/113924766_zpid/',
    source: 'zillow',
  },
  {
    id: '1224-saint-joseph-ave',
    name: '1224 Saint Joseph Ave Albert Lea, MN',
    url: 'https://www.zillow.com/homedetails/1224-Saint-Joseph-Ave-Albert-Lea-MN-56007/106676968_zpid/',
    source: 'zillow',
  },

  // --- Pending ---
  {
    id: '15131-800th-ave-glenville',
    name: '15131 800th Ave Glenville, MN',
    url: 'https://www.zillow.com/homedetails/15131-800th-Ave-Glenville-MN-56036/106671135_zpid/',
    source: 'zillow',
    status: 'pending',
  },

  // --- Sold ---
  {
    id: '1707-sunset-street',
    name: '1707 Sunset Street Albert Lea, MN',
    url: 'https://www.zillow.com/homedetails/1707-Sunset-St-Albert-Lea-MN-56007/106675985_zpid/',
    source: 'zillow',
    status: 'sold',
  },
  {
    id: '73341-220th-street',
    name: '73341 220th Street Albert Lea, MN',
    url: 'https://www.zillow.com/homedetails/73341-220th-St-Albert-Lea-MN-56007/106679658_zpid/',
    source: 'zillow',
    status: 'sold',
  },
];
// Root folder for scraper output, located next to this script
const OUTPUT_DIR = path.join(__dirname, 'properties');

// Create the root folder when nothing exists at that path yet
fs.existsSync(OUTPUT_DIR) || fs.mkdirSync(OUTPUT_DIR, { recursive: true });
/**
 * Download an image from a URL and save it to disk.
 *
 * Follows HTTP redirects (301, 302, 303, 307, 308) and resolves relative
 * `Location` headers against the URL that produced them. The destination file
 * is only opened once a 200 response has arrived, so failed requests and
 * redirects do not create it; a partially written file is removed when the
 * transfer breaks.
 *
 * @param {string} url - Absolute http(s) URL of the image.
 * @param {string} filepath - Destination path for the downloaded file.
 * @param {number} [redirectsLeft=5] - Redirect hops still allowed before giving up.
 * @returns {Promise<string>} Resolves with `filepath` once the file is fully written.
 * @throws {Error} Rejects on request errors, non-200 responses, write errors,
 *   an interrupted transfer, or when the redirect limit is exceeded.
 */
async function downloadImage(url, filepath, redirectsLeft = 5) {
  return new Promise((resolve, reject) => {
    const protocol = url.startsWith('https') ? https : http;
    const redirectCodes = [301, 302, 303, 307, 308];

    // Set once a 200 response arrives; stays null for every other outcome.
    let file = null;

    // Single failure path: drop any partial file, then reject.
    const fail = (err) => {
      if (file === null) {
        reject(err);
        return;
      }
      file.destroy();
      fs.unlink(filepath, () => reject(err));
    };

    const request = protocol.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
      }
    }, (response) => {
      const { statusCode, headers } = response;

      // Handle redirects
      if (redirectCodes.includes(statusCode) && headers.location) {
        // Drain the unused body so the socket is released.
        response.resume();
        if (redirectsLeft <= 0) {
          reject(new Error(`Too many redirects while downloading: ${url}`));
          return;
        }
        let nextUrl;
        try {
          // Location may be relative; resolve it against the current URL.
          nextUrl = new URL(headers.location, url).toString();
        } catch (err) {
          reject(err);
          return;
        }
        downloadImage(nextUrl, filepath, redirectsLeft - 1).then(resolve, reject);
        return;
      }

      if (statusCode !== 200) {
        response.resume();
        reject(new Error(`Failed to download: ${statusCode}`));
        return;
      }

      file = fs.createWriteStream(filepath);
      file.on('error', fail);
      response.on('error', fail);
      // A connection dropped mid-body never emits 'end', so detect it on close.
      response.on('close', () => {
        if (!response.complete) {
          fail(new Error('Connection closed before the download completed'));
        }
      });
      file.on('finish', () => {
        // Resolve only after the file descriptor has actually been closed.
        file.close(() => resolve(filepath));
      });
      response.pipe(file);
    });

    request.on('error', fail);
  });
}
/**
 * Scrape a Zillow property page.
 *
 * Loads the listing, reads address, price, bed/bath/sqft summary, description,
 * photo URLs and fact rows from the DOM, then tries to open the photo gallery
 * to collect additional image URLs. A gallery failure is logged and the data
 * gathered so far is still returned.
 *
 * @param {object} page - Playwright page used to load and query the listing.
 * @param {{name: string, url: string}} property - Listing entry to scrape.
 * @returns {Promise<object>} Object with address, price, bedrooms, bathrooms,
 *   sqft, lotSize, yearBuilt, description, images and details. lotSize and
 *   yearBuilt are not populated by this scraper and stay empty strings.
 */
async function scrapeZillow(page, property) {
  console.log(`Scraping Zillow: ${property.name}`);
  await page.goto(property.url, { waitUntil: 'domcontentloaded', timeout: 60000 });
  await page.waitForTimeout(3000);

  const data = await page.evaluate(() => {
    // This callback is executed inside the browser page, so it must stay
    // self-contained: no references to helpers or variables defined outside.
    const result = {
      address: '',
      price: '',
      bedrooms: '',
      bathrooms: '',
      sqft: '',
      lotSize: '',
      yearBuilt: '',
      description: '',
      images: [],
      details: {}
    };

    // Read the number written directly in front of a unit ("3 bd",
    // "1,500 sqft"); when no number precedes the unit, fall back to the first
    // number in the text. Returns '' when the text holds no number at all.
    const readNumber = (text, unitPattern) => {
      const number = '\\d[\\d,]*(?:\\.\\d+)?';
      const match = text.match(new RegExp(`(${number})\\s*(?:${unitPattern})`)) ||
        text.match(new RegExp(`(${number})`));
      return match ? match[1].replace(/,/g, '') : '';
    };

    // Get address
    const addressEl = document.querySelector('h1[class*="Text"]') ||
      document.querySelector('[data-testid="home-details-chip-container"]')?.parentElement?.querySelector('h1');
    if (addressEl) result.address = addressEl.textContent.trim();

    // Get price
    const priceEl = document.querySelector('[data-testid="price"]') ||
      document.querySelector('.summary-container span');
    if (priceEl) result.price = priceEl.textContent.trim();

    // Get bed/bath/sqft from summary. The selector also matches wrapper spans
    // (several values in one text) and label-only spans (no number), so each
    // value is parsed relative to its unit and never overwritten with ''.
    const summaryItems = document.querySelectorAll('[data-testid="bed-bath-beyond"] span, .summary-container span');
    summaryItems.forEach((item) => {
      const text = item.textContent.trim().toLowerCase();
      if (text.includes('bd') || text.includes('bed')) {
        result.bedrooms = readNumber(text, 'bd|bed') || result.bedrooms;
      }
      // 'ba' also covers 'bath'
      if (text.includes('ba')) {
        result.bathrooms = readNumber(text, 'ba') || result.bathrooms;
      }
      if (text.includes('sqft') || text.includes('sq ft')) {
        result.sqft = readNumber(text, 'sqft|sq ft') || result.sqft;
      }
    });

    // Get description
    const descEl = document.querySelector('[data-testid="description-text"]') ||
      document.querySelector('.ds-overview-section');
    if (descEl) result.description = descEl.textContent.trim();

    // Get images from photo gallery
    const imageEls = document.querySelectorAll('picture source[type="image/webp"], picture img, [data-testid*="photo"] img');
    const imageSet = new Set();
    imageEls.forEach((el) => {
      // For a srcset take the last candidate and drop its width/density
      // descriptor; this also handles a srcset with a single candidate.
      const url = el.srcset
        ? el.srcset.split(',').pop().trim().split(/\s+/)[0]
        : el.src;
      if (!url) return;
      if (url.startsWith('http')) {
        // Query string is dropped so the same photo is only stored once
        imageSet.add(url.split('?')[0]);
      }
    });
    result.images = Array.from(imageSet).slice(0, 20);

    // Get facts/details
    const factRows = document.querySelectorAll('[data-testid*="fact"], .ds-home-fact-list-item');
    factRows.forEach((row) => {
      const label = row.querySelector('[data-testid*="label"], .ds-home-fact-list-item-label');
      const value = row.querySelector('[data-testid*="value"], .ds-home-fact-list-item-value');
      if (label && value) {
        result.details[label.textContent.trim()] = value.textContent.trim();
      }
    });

    return result;
  });

  // Try to get more images by clicking on photo gallery
  try {
    const photoBtn = await page.$('[data-testid="media-stream-tile"], .photo-tile, [aria-label*="Photo"]');
    if (photoBtn) {
      await photoBtn.click();
      await page.waitForTimeout(2000);
      const moreImages = await page.evaluate(() => {
        const imgs = document.querySelectorAll('[data-testid*="lightbox"] img, .media-stream img, .photo-carousel img');
        return Array.from(imgs).map((img) => img.src).filter((src) => src && src.startsWith('http'));
      });
      // Normalise gallery URLs the same way as the first pass (no query
      // string) so photos already collected are recognised as duplicates.
      const seen = new Set(data.images);
      for (const img of moreImages) {
        const cleaned = img.split('?')[0];
        if (!seen.has(cleaned)) {
          seen.add(cleaned);
          data.images.push(cleaned);
        }
      }
    }
  } catch (e) {
    // Best effort only: keep what was scraped, but say why the gallery failed
    console.log(` Could not load additional gallery images: ${e.message}`);
  }

  return data;
}
/**
 * Scrape a HomeFinder property page.
 *
 * Loads the listing and reads address, price, bed/bath/sqft, lot size, year
 * built, description and image URLs from the DOM using generic class-based
 * selectors.
 *
 * @param {object} page - Playwright page used to load and query the listing.
 * @param {{name: string, url: string}} property - Listing entry to scrape.
 * @returns {Promise<object>} Object with address, price, bedrooms, bathrooms,
 *   sqft, lotSize, yearBuilt, description, images and details. details is
 *   not populated by this scraper and stays an empty object.
 */
async function scrapeHomefinder(page, property) {
  console.log(`Scraping HomeFinder: ${property.name}`);
  await page.goto(property.url, { waitUntil: 'networkidle', timeout: 60000 });
  await page.waitForTimeout(3000);

  const data = await page.evaluate(() => {
    // This callback is executed inside the browser page, so it must stay
    // self-contained: no references to helpers or variables defined outside.
    const result = {
      address: '',
      price: '',
      bedrooms: '',
      bathrooms: '',
      sqft: '',
      lotSize: '',
      yearBuilt: '',
      description: '',
      images: [],
      details: {}
    };

    // Read the number written directly in front of a unit ("3 Beds",
    // "1,500 Sq Ft"); when no number precedes the unit, fall back to the first
    // number in the text. Returns '' when the text holds no number at all.
    const readNumber = (text, unitPattern) => {
      const number = '\\d[\\d,]*(?:\\.\\d+)?';
      const match = text.match(new RegExp(`(${number})\\s*(?:${unitPattern})`)) ||
        text.match(new RegExp(`(${number})`));
      return match ? match[1].replace(/,/g, '') : '';
    };

    // Get address from title or header
    const addressEl = document.querySelector('h1, .property-address, [class*="address"]');
    if (addressEl) result.address = addressEl.textContent.trim();

    // Get price
    const priceEl = document.querySelector('.price, [class*="price"], [class*="Price"]');
    if (priceEl) result.price = priceEl.textContent.trim();

    // Look for property details. The broad selectors can match wrapper and
    // label-only elements, so a value is only replaced by a non-empty reading.
    const detailItems = document.querySelectorAll('.property-details li, [class*="detail"] span, [class*="spec"]');
    detailItems.forEach((item) => {
      const text = item.textContent.trim().toLowerCase();
      if (text.includes('bed')) {
        result.bedrooms = readNumber(text, 'bed') || result.bedrooms;
      }
      if (text.includes('bath')) {
        result.bathrooms = readNumber(text, 'bath') || result.bathrooms;
      }
      if (text.includes('sqft') || text.includes('sq ft')) {
        // Thousands separators are removed, matching the Zillow scraper output
        result.sqft = readNumber(text, 'sqft|sq ft') || result.sqft;
      }
      if (text.includes('acre')) result.lotSize = item.textContent.trim();
      if (text.includes('built')) {
        // Take a standalone four-digit number instead of gluing every digit together
        const year = text.match(/\b\d{4}\b/);
        if (year) result.yearBuilt = year[0];
      }
    });

    // Get description
    const descEl = document.querySelector('.description, [class*="description"], .property-description');
    if (descEl) result.description = descEl.textContent.trim();

    // Get images; the selectors overlap, so collect into a Set to avoid
    // listing (and later downloading) the same URL more than once.
    const imageEls = document.querySelectorAll('.gallery img, [class*="photo"] img, .carousel img, picture img');
    const imageSet = new Set();
    imageEls.forEach((el) => {
      if (el.src && el.src.startsWith('http')) {
        imageSet.add(el.src);
      }
    });
    result.images = Array.from(imageSet);

    return result;
  });

  return data;
}
/**
 * Choose the file extension for a downloaded image from its URL path.
 * Recognises png, webp and gif; everything else is saved as jpg.
 *
 * @param {string} imgUrl - Image URL (query string is ignored).
 * @returns {string} Extension without the leading dot.
 */
function getImageExtension(imgUrl) {
  let pathname;
  try {
    pathname = new URL(imgUrl).pathname;
  } catch {
    // Not a parseable absolute URL; fall back to the raw string
    pathname = imgUrl;
  }
  const ext = path.extname(pathname).slice(1).toLowerCase();
  return ['png', 'webp', 'gif'].includes(ext) ? ext : 'jpg';
}

/**
 * Main scraping function.
 *
 * For every entry in PROPERTIES: scrape the listing with the matching
 * site scraper, save a full-page screenshot, download up to 20 images and
 * write the result to `<OUTPUT_DIR>/<id>/property.json`. A failure on one
 * property is logged and the loop continues. Finally writes
 * `all-properties.json` with every successfully scraped property.
 *
 * @returns {Promise<void>}
 * @throws {Error} Propagates failures outside the per-property scrape
 *   (browser launch, page creation, summary write); the browser is closed first.
 */
async function main() {
  // Upper bound on images downloaded per property
  const MAX_IMAGES = 20;

  console.log('Starting property scraper...\n');

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });
  const allProperties = [];

  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      viewport: { width: 1920, height: 1080 }
    });

    for (const property of PROPERTIES) {
      const propertyDir = path.join(OUTPUT_DIR, property.id);
      const imagesDir = path.join(propertyDir, 'images');

      // Create directories: a recursive mkdir also creates propertyDir and
      // does nothing when both folders already exist.
      fs.mkdirSync(imagesDir, { recursive: true });

      const page = await context.newPage();
      try {
        const data = property.source === 'zillow'
          ? await scrapeZillow(page, property)
          : await scrapeHomefinder(page, property);

        // Add metadata
        data.id = property.id;
        data.sourceUrl = property.url;
        data.source = property.source;
        data.status = property.status || 'active';
        data.originalName = property.name;

        // Take screenshot
        const screenshotPath = path.join(propertyDir, 'screenshot.png');
        await page.screenshot({ path: screenshotPath, fullPage: true });
        console.log(` Screenshot saved: ${screenshotPath}`);

        // Download images (sequentially; a failed image is logged and skipped)
        const downloadedImages = [];
        const imageUrls = data.images.slice(0, MAX_IMAGES);
        for (const [index, imgUrl] of imageUrls.entries()) {
          const imageNumber = index + 1;
          const fileName = `image-${String(imageNumber).padStart(2, '0')}.${getImageExtension(imgUrl)}`;
          const imgPath = path.join(imagesDir, fileName);
          try {
            await downloadImage(imgUrl, imgPath);
            downloadedImages.push(imgPath);
            console.log(` Downloaded image ${imageNumber}/${imageUrls.length}`);
          } catch (err) {
            console.log(` Failed to download image ${imageNumber}: ${err.message}`);
          }
        }
        data.downloadedImages = downloadedImages;

        // Save property data
        const jsonPath = path.join(propertyDir, 'property.json');
        fs.writeFileSync(jsonPath, JSON.stringify(data, null, 2));
        console.log(` Data saved: ${jsonPath}\n`);

        allProperties.push(data);
      } catch (err) {
        console.error(` Error scraping ${property.name}: ${err.message}\n`);
      } finally {
        await page.close();
      }

      // Wait between requests
      await new Promise((resolve) => setTimeout(resolve, 2000));
    }
  } finally {
    // Always release the browser process, even when the loop aborts
    await browser.close();
  }

  // Save summary
  const summaryPath = path.join(OUTPUT_DIR, 'all-properties.json');
  fs.writeFileSync(summaryPath, JSON.stringify(allProperties, null, 2));
  console.log(`\nSummary saved to: ${summaryPath}`);
  console.log(`Total properties scraped: ${allProperties.length}`);
}
// Run the scraper. A fatal error is printed and reported through a non-zero
// exit code so shells and CI jobs can tell the run failed.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});