775c57a678
- Remove status dropdown (always show all properties) - Remove sort dropdown (use status-based sorting) - Sort order: Active > Pending > Sold, then by modified date - Map view: half height, 2-column property grid - Beds field same width as others - Add CLAUDE.md documentation for property system 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
375 lines
13 KiB
JavaScript
375 lines
13 KiB
JavaScript
/**
 * Property Scraper for HomeProz Real Estate
 *
 * Scrapes property listings from homefinder.com and zillow.com.
 * Extracts: name, address, details, description, and images.
 */
|
|
|
|
const { chromium } = require('playwright');
|
|
const fs = require('fs');
|
|
const path = require('path');
|
|
const https = require('https');
|
|
const http = require('http');
|
|
|
|
/**
 * Property listings to scrape.
 *
 * Each entry provides:
 *   - id:     slug used as the per-property output folder name
 *   - name:   human-readable label used in log output
 *   - url:    listing page to scrape
 *   - source: 'zillow' or 'homefinder' (selects the scraper)
 *   - status: optional; entries without it are treated as 'active'
 */
const PROPERTIES = [
  // --- Active listings (no explicit status) ---
  {
    id: 'geneva-mn-lot',
    name: 'Geneva, MN - Double Lot with Shop',
    url: 'https://homefinder.com/realestateandhomes-detail/Geneva_MN_56035_M93901-25044?from=srp-list-card',
    source: 'homefinder',
  },
  {
    id: '115-newton-n-ave',
    name: '115 Newton N Ave Albert Lea, MN',
    url: 'https://www.zillow.com/homedetails/115-N-Newton-Ave-Albert-Lea-MN-56007/458158566_zpid/',
    source: 'zillow',
  },
  {
    id: '411-court-street',
    name: '411 Court Street Albert Lea, MN',
    url: 'https://www.zillow.com/homedetails/411-Court-St-Albert-Lea-MN-56007/113924766_zpid/',
    source: 'zillow',
  },
  {
    id: '1224-saint-joseph-ave',
    name: '1224 Saint Joseph Ave Albert Lea, MN',
    url: 'https://www.zillow.com/homedetails/1224-Saint-Joseph-Ave-Albert-Lea-MN-56007/106676968_zpid/',
    source: 'zillow',
  },

  // --- Pending ---
  {
    id: '15131-800th-ave-glenville',
    name: '15131 800th Ave Glenville, MN',
    url: 'https://www.zillow.com/homedetails/15131-800th-Ave-Glenville-MN-56036/106671135_zpid/',
    source: 'zillow',
    status: 'pending',
  },

  // --- Sold ---
  {
    id: '1707-sunset-street',
    name: '1707 Sunset Street Albert Lea, MN',
    url: 'https://www.zillow.com/homedetails/1707-Sunset-St-Albert-Lea-MN-56007/106675985_zpid/',
    source: 'zillow',
    status: 'sold',
  },
  {
    id: '73341-220th-street',
    name: '73341 220th Street Albert Lea, MN',
    url: 'https://www.zillow.com/homedetails/73341-220th-St-Albert-Lea-MN-56007/106679658_zpid/',
    source: 'zillow',
    status: 'sold',
  },
];
|
|
|
|
// Root folder for scraper output; lives next to this script.
const OUTPUT_DIR = path.join(__dirname, 'properties');

// Create the output folder up front when it is not there yet.
if (fs.existsSync(OUTPUT_DIR) === false) {
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
|
|
|
|
/**
 * Download an image from a URL and save it to disk.
 *
 * Follows HTTP redirects (301, 302, 303, 307, 308), resolving relative
 * `Location` headers against the URL that produced them. The destination
 * file is only created once a 200 response arrives, so redirects and failed
 * requests do not leave empty files or open write streams behind.
 *
 * @param {string} url - Absolute http(s) URL of the image.
 * @param {string} filepath - Destination path on disk.
 * @param {number} [redirectsLeft=5] - Redirects still allowed before giving up.
 * @returns {Promise<string>} Resolves with `filepath` once the file is written.
 *   Rejects on network errors, non-200 responses, invalid or excessive
 *   redirects, and file write errors.
 */
async function downloadImage(url, filepath, redirectsLeft = 5) {
  return new Promise((resolve, reject) => {
    const protocol = url.startsWith('https') ? https : http;

    const request = protocol.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
      }
    }, (response) => {
      const { statusCode, headers } = response;

      // Handle redirects
      if ([301, 302, 303, 307, 308].includes(statusCode) && headers.location) {
        // Drain the redirect body so the socket is released.
        response.resume();

        if (redirectsLeft <= 0) {
          reject(new Error(`Too many redirects while downloading: ${url}`));
          return;
        }

        let nextUrl;
        try {
          // Location may be relative; resolve it against the current URL.
          nextUrl = new URL(headers.location, url).toString();
        } catch (err) {
          reject(new Error(`Invalid redirect location "${headers.location}": ${err.message}`));
          return;
        }

        downloadImage(nextUrl, filepath, redirectsLeft - 1).then(resolve, reject);
        return;
      }

      if (statusCode !== 200) {
        response.resume();
        reject(new Error(`Failed to download: ${statusCode}`));
        return;
      }

      // Only open the destination once we know there is a body to write.
      const file = fs.createWriteStream(filepath);

      // Remove the partial file before reporting a mid-transfer failure.
      const fail = (err) => {
        file.destroy();
        fs.unlink(filepath, () => reject(err));
      };

      response.on('error', fail);
      file.on('error', fail);
      file.on('finish', () => {
        file.close(() => resolve(filepath));
      });

      response.pipe(file);
    });

    request.on('error', reject);
  });
}
|
|
|
|
/**
 * Scrape a Zillow property page.
 *
 * Navigates to `property.url`, reads the listing fields from the rendered
 * DOM, then tries to open the photo gallery to collect additional images.
 * Gallery failures are logged and otherwise ignored (best effort).
 *
 * @param {object} page - Playwright page used for navigation and DOM access.
 * @param {{name: string, url: string}} property - Listing to scrape.
 * @returns {Promise<object>} Scraped data: `address`, `price`, `bedrooms`,
 *   `bathrooms`, `sqft`, `description` (strings, empty when not found),
 *   `lotSize` and `yearBuilt` (always empty for this scraper), `images`
 *   (unique image URLs with query strings removed) and `details`
 *   (label -> value map of listed facts).
 */
async function scrapeZillow(page, property) {
  console.log(`Scraping Zillow: ${property.name}`);

  await page.goto(property.url, { waitUntil: 'domcontentloaded', timeout: 60000 });
  // Fixed pause after DOMContentLoaded before reading the DOM.
  await page.waitForTimeout(3000);

  // This callback is executed inside the page, so it must be self-contained:
  // helpers are declared inside it and it cannot reference Node-side variables.
  const data = await page.evaluate(() => {
    const result = {
      address: '',
      price: '',
      bedrooms: '',
      bathrooms: '',
      sqft: '',
      lotSize: '',
      yearBuilt: '',
      description: '',
      images: [],
      details: {}
    };

    // Returns the URL of the last srcset candidate, without its descriptor
    // (the last candidate is assumed to be the highest resolution).
    const lastSrcsetUrl = (srcset) => {
      const candidates = srcset.split(',').map((part) => part.trim()).filter(Boolean);
      const last = candidates[candidates.length - 1] || '';
      return last.split(/\s+/)[0];
    };

    // Get address
    const addressEl = document.querySelector('h1[class*="Text"]') ||
      document.querySelector('[data-testid="home-details-chip-container"]')?.parentElement?.querySelector('h1');
    if (addressEl) result.address = addressEl.textContent.trim();

    // Get price
    const priceEl = document.querySelector('[data-testid="price"]') ||
      document.querySelector('span[data-testid="price"]') ||
      document.querySelector('.summary-container span');
    if (priceEl) result.price = priceEl.textContent.trim();

    // Get bed/bath/sqft from summary: values are recognised by keyword and
    // reduced to their numeric part.
    const summaryItems = document.querySelectorAll('[data-testid="bed-bath-beyond"] span, .summary-container span');
    summaryItems.forEach((item) => {
      const text = item.textContent.trim().toLowerCase();
      if (text.includes('bd') || text.includes('bed')) {
        result.bedrooms = text.replace(/[^\d.]/g, '');
      }
      if (text.includes('ba') || text.includes('bath')) {
        result.bathrooms = text.replace(/[^\d.]/g, '');
      }
      if (text.includes('sqft') || text.includes('sq ft')) {
        // Digits only, so every thousands separator is dropped.
        result.sqft = text.replace(/\D/g, '');
      }
    });

    // Get description
    const descEl = document.querySelector('[data-testid="description-text"]') ||
      document.querySelector('.ds-overview-section');
    if (descEl) result.description = descEl.textContent.trim();

    // Get images from photo gallery
    const imageEls = document.querySelectorAll('picture source[type="image/webp"], picture img, [data-testid*="photo"] img');
    const imageSet = new Set();
    imageEls.forEach((el) => {
      // Only srcset values are candidate lists; a plain src is used as-is.
      const candidate = el.srcset ? lastSrcsetUrl(el.srcset) : el.src;
      if (candidate && candidate.startsWith('http')) {
        // Drop the query string so the same image is not collected twice.
        imageSet.add(candidate.split('?')[0]);
      }
    });
    result.images = [...imageSet].slice(0, 20);

    // Get facts/details
    const factRows = document.querySelectorAll('[data-testid*="fact"], .ds-home-fact-list-item');
    factRows.forEach((row) => {
      const label = row.querySelector('[data-testid*="label"], .ds-home-fact-list-item-label');
      const value = row.querySelector('[data-testid*="value"], .ds-home-fact-list-item-value');
      if (label && value) {
        result.details[label.textContent.trim()] = value.textContent.trim();
      }
    });

    return result;
  });

  // Try to get more images by clicking on photo gallery
  try {
    const photoBtn = await page.$('[data-testid="media-stream-tile"], .photo-tile, [aria-label*="Photo"]');
    if (photoBtn) {
      await photoBtn.click();
      await page.waitForTimeout(2000);

      const moreImages = await page.evaluate(() => {
        const imgs = document.querySelectorAll('[data-testid*="lightbox"] img, .media-stream img, .photo-carousel img');
        return Array.from(imgs).map((img) => img.src).filter((src) => src && src.startsWith('http'));
      });

      // Normalise gallery URLs the same way as the first pass so an image
      // already collected without its query string is not added again.
      const seen = new Set(data.images);
      for (const img of moreImages) {
        const normalized = img.split('?')[0];
        if (!seen.has(normalized)) {
          seen.add(normalized);
          data.images.push(normalized);
        }
      }
    }
  } catch {
    // Best effort: the images from the first pass are still returned.
    console.log('  Could not load additional gallery images');
  }

  return data;
}
|
|
|
|
/**
 * Scrape a HomeFinder property page.
 *
 * Navigates to `property.url`, waits for the network to go idle and reads
 * the listing fields from the rendered DOM using generic class-name
 * selectors.
 *
 * @param {object} page - Playwright page used for navigation and DOM access.
 * @param {{name: string, url: string}} property - Listing to scrape.
 * @returns {Promise<object>} Scraped data: `address`, `price`, `bedrooms`,
 *   `bathrooms`, `sqft`, `lotSize`, `yearBuilt`, `description` (strings, empty
 *   when not found), `images` (unique absolute image URLs) and `details`
 *   (always an empty object for this scraper).
 */
async function scrapeHomefinder(page, property) {
  console.log(`Scraping HomeFinder: ${property.name}`);

  await page.goto(property.url, { waitUntil: 'networkidle', timeout: 60000 });
  await page.waitForTimeout(3000);

  // This callback is executed inside the page, so it must be self-contained
  // and cannot reference Node-side variables.
  const data = await page.evaluate(() => {
    const result = {
      address: '',
      price: '',
      bedrooms: '',
      bathrooms: '',
      sqft: '',
      lotSize: '',
      yearBuilt: '',
      description: '',
      images: [],
      details: {}
    };

    // Get address from title or header
    const addressEl = document.querySelector('h1, .property-address, [class*="address"]');
    if (addressEl) result.address = addressEl.textContent.trim();

    // Get price
    const priceEl = document.querySelector('.price, [class*="price"], [class*="Price"]');
    if (priceEl) result.price = priceEl.textContent.trim();

    // Look for property details: each item is matched by keyword and
    // reduced to its numeric part (lot size keeps the original text).
    const detailItems = document.querySelectorAll('.property-details li, [class*="detail"] span, [class*="spec"]');
    detailItems.forEach((item) => {
      const original = item.textContent.trim();
      const text = original.toLowerCase();
      if (text.includes('bed')) result.bedrooms = text.replace(/[^\d.]/g, '');
      if (text.includes('bath')) result.bathrooms = text.replace(/[^\d.]/g, '');
      if (text.includes('sqft') || text.includes('sq ft')) {
        // Digits only (no thousands separators), matching scrapeZillow's format.
        result.sqft = text.replace(/\D/g, '');
      }
      if (text.includes('acre')) result.lotSize = original;
      if (text.includes('built')) result.yearBuilt = text.replace(/[^\d]/g, '');
    });

    // Get description
    const descEl = document.querySelector('.description, [class*="description"], .property-description');
    if (descEl) result.description = descEl.textContent.trim();

    // Get images; an element can match several selectors or repeat in the
    // page, so collect through a Set to keep each URL once.
    const imageEls = document.querySelectorAll('.gallery img, [class*="photo"] img, .carousel img, picture img');
    const imageSet = new Set();
    imageEls.forEach((el) => {
      if (el.src && el.src.startsWith('http')) {
        imageSet.add(el.src);
      }
    });
    result.images = [...imageSet];

    return result;
  });

  return data;
}
|
|
|
|
/**
 * Main scraping function.
 *
 * Launches a headless Chromium browser and, for every entry in PROPERTIES:
 * scrapes the listing, saves a full-page screenshot, downloads up to 20
 * images and writes `property.json` into the property's folder under
 * OUTPUT_DIR. A failure on one property is logged and the run continues with
 * the next one. Finally writes `all-properties.json` with every property
 * that was scraped successfully.
 *
 * The browser is always closed, even when an unexpected error aborts the run.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('Starting property scraper...\n');

  const MAX_IMAGES = 20;

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });

  const allProperties = [];

  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      viewport: { width: 1920, height: 1080 }
    });

    for (const property of PROPERTIES) {
      const propertyDir = path.join(OUTPUT_DIR, property.id);
      const imagesDir = path.join(propertyDir, 'images');

      let page;

      try {
        // Create directories. The recursive mkdir also creates propertyDir
        // and is a no-op when the folders already exist.
        fs.mkdirSync(imagesDir, { recursive: true });

        page = await context.newPage();

        const data = property.source === 'zillow'
          ? await scrapeZillow(page, property)
          : await scrapeHomefinder(page, property);

        // Add metadata
        data.id = property.id;
        data.sourceUrl = property.url;
        data.source = property.source;
        data.status = property.status || 'active';
        data.originalName = property.name;

        // Take screenshot
        const screenshotPath = path.join(propertyDir, 'screenshot.png');
        await page.screenshot({ path: screenshotPath, fullPage: true });
        console.log(`  Screenshot saved: ${screenshotPath}`);

        // Download images (sequentially, capped at MAX_IMAGES). A failed
        // image is logged and skipped; it does not fail the property.
        const downloadedImages = [];
        const imageUrls = data.images.slice(0, MAX_IMAGES);
        for (const [i, imgUrl] of imageUrls.entries()) {
          // NOTE(review): anything that is not a .png is saved as .jpg,
          // including WebP sources; confirm consumers are fine with that.
          const ext = imgUrl.includes('.png') ? 'png' : 'jpg';
          const imgPath = path.join(imagesDir, `image-${String(i + 1).padStart(2, '0')}.${ext}`);

          try {
            await downloadImage(imgUrl, imgPath);
            downloadedImages.push(imgPath);
            console.log(`  Downloaded image ${i + 1}/${data.images.length}`);
          } catch (err) {
            console.log(`  Failed to download image ${i + 1}: ${err.message}`);
          }
        }

        data.downloadedImages = downloadedImages;

        // Save property data
        const jsonPath = path.join(propertyDir, 'property.json');
        fs.writeFileSync(jsonPath, JSON.stringify(data, null, 2));
        console.log(`  Data saved: ${jsonPath}\n`);

        allProperties.push(data);
      } catch (err) {
        console.error(`  Error scraping ${property.name}: ${err.message}\n`);
      } finally {
        // Release the tab whether or not the scrape succeeded.
        if (page) {
          await page.close();
        }
      }

      // Wait between requests
      await new Promise((resolve) => setTimeout(resolve, 2000));
    }
  } finally {
    // Never leave a Chromium process behind, even on unexpected errors.
    await browser.close();
  }

  // Save summary
  const summaryPath = path.join(OUTPUT_DIR, 'all-properties.json');
  fs.writeFileSync(summaryPath, JSON.stringify(allProperties, null, 2));
  console.log(`\nSummary saved to: ${summaryPath}`);
  console.log(`Total properties scraped: ${allProperties.length}`);
}
|
|
|
|
// Run the scraper; on an unexpected failure, log it and exit with a non-zero
// status so shells and CI can detect that the run failed.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|