/**
 * Property Scraper for HomeProz Real Estate
 *
 * Scrapes property listings from homefinder.com and zillow.com
 * Extracts: name, address, details, description, and images
 *
 * For every entry in PROPERTIES the script writes, under
 * `properties/<id>/`: a full-page screenshot, up to
 * MAX_IMAGES_PER_PROPERTY downloaded images, and a `property.json` file.
 * A combined `properties/all-properties.json` is written at the end.
 */

const { chromium } = require('playwright');
const fs = require('fs');
const path = require('path');
const https = require('https');
const http = require('http');

// Property listings to scrape
const PROPERTIES = [
  {
    id: 'geneva-mn-lot',
    name: 'Geneva, MN - Double Lot with Shop',
    url: 'https://homefinder.com/realestateandhomes-detail/Geneva_MN_56035_M93901-25044?from=srp-list-card',
    source: 'homefinder'
  },
  {
    id: '115-newton-n-ave',
    name: '115 Newton N Ave Albert Lea, MN',
    url: 'https://www.zillow.com/homedetails/115-N-Newton-Ave-Albert-Lea-MN-56007/458158566_zpid/',
    source: 'zillow'
  },
  {
    id: '411-court-street',
    name: '411 Court Street Albert Lea, MN',
    url: 'https://www.zillow.com/homedetails/411-Court-St-Albert-Lea-MN-56007/113924766_zpid/',
    source: 'zillow'
  },
  {
    id: '1224-saint-joseph-ave',
    name: '1224 Saint Joseph Ave Albert Lea, MN',
    url: 'https://www.zillow.com/homedetails/1224-Saint-Joseph-Ave-Albert-Lea-MN-56007/106676968_zpid/',
    source: 'zillow'
  },
  {
    id: '15131-800th-ave-glenville',
    name: '15131 800th Ave Glenville, MN',
    url: 'https://www.zillow.com/homedetails/15131-800th-Ave-Glenville-MN-56036/106671135_zpid/',
    source: 'zillow',
    status: 'pending'
  },
  {
    id: '1707-sunset-street',
    name: '1707 Sunset Street Albert Lea, MN',
    url: 'https://www.zillow.com/homedetails/1707-Sunset-St-Albert-Lea-MN-56007/106675985_zpid/',
    source: 'zillow',
    status: 'sold'
  },
  {
    id: '73341-220th-street',
    name: '73341 220th Street Albert Lea, MN',
    url: 'https://www.zillow.com/homedetails/73341-220th-St-Albert-Lea-MN-56007/106679658_zpid/',
    source: 'zillow',
    status: 'sold'
  }
];

const OUTPUT_DIR = path.join(__dirname, 'properties');

// Upper bound on images downloaded per property.
const MAX_IMAGES_PER_PROPERTY = 20;

// Upper bound on redirect hops followed for a single image download.
const MAX_REDIRECTS = 5;

// Abort a download whose socket has been idle for this long.
const DOWNLOAD_TIMEOUT_MS = 30000;

// HTTP statuses that carry a Location header to follow.
const REDIRECT_STATUS_CODES = new Set([301, 302, 303, 307, 308]);

// Extensions kept as-is when naming downloaded files; anything else becomes 'jpg'.
const KNOWN_IMAGE_EXTENSIONS = new Set(['jpg', 'png', 'webp', 'gif', 'avif']);

// Ensure output directory exists (recursive mkdir is a no-op when it already does)
fs.mkdirSync(OUTPUT_DIR, { recursive: true });

/**
 * Download an image from URL
 *
 * The destination file is only opened once a 200 response has arrived, so
 * redirects and error responses never leave an empty file or an open file
 * descriptor behind. A partially written file is removed on failure.
 *
 * @param {string} url - Absolute http(s) URL of the image.
 * @param {string} filepath - Destination path on disk.
 * @param {number} [redirectsLeft=MAX_REDIRECTS] - Remaining redirect hops.
 * @returns {Promise<string>} Resolves with `filepath` once the file is closed.
 *   Rejects on network errors, non-200 responses, too many redirects,
 *   timeouts, and file-system write errors.
 */
function downloadImage(url, filepath, redirectsLeft = MAX_REDIRECTS) {
  return new Promise((resolve, reject) => {
    const protocol = url.startsWith('https') ? https : http;
    let file = null;
    let settled = false;

    // Single failure path: runs at most once and removes a partial file.
    const fail = (err) => {
      if (settled) return;
      settled = true;
      if (file === null) {
        reject(err);
        return;
      }
      file.destroy();
      fs.unlink(filepath, () => reject(err));
    };

    const request = protocol.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
      }
    }, (response) => {
      const { statusCode, headers } = response;

      // Handle redirects
      if (REDIRECT_STATUS_CODES.has(statusCode) && headers.location) {
        // Drain the body so the socket is released.
        response.resume();
        if (redirectsLeft <= 0) {
          fail(new Error('Failed to download: too many redirects'));
          return;
        }
        let nextUrl;
        try {
          // Location may be relative; resolve it against the current URL.
          nextUrl = new URL(headers.location, url).toString();
        } catch (err) {
          fail(err);
          return;
        }
        downloadImage(nextUrl, filepath, redirectsLeft - 1).then(resolve, reject);
        return;
      }

      if (statusCode !== 200) {
        response.resume();
        fail(new Error(`Failed to download: ${statusCode}`));
        return;
      }

      file = fs.createWriteStream(filepath);
      file.on('error', (err) => {
        response.destroy();
        fail(err);
      });
      response.on('error', fail);
      // Connection dropped mid-body: without this the promise could hang.
      response.on('aborted', () => fail(new Error('Failed to download: connection aborted')));
      file.on('finish', () => {
        // Resolve only after the descriptor has actually been closed.
        file.close(() => {
          if (settled) return;
          settled = true;
          resolve(filepath);
        });
      });
      response.pipe(file);
    });

    request.setTimeout(DOWNLOAD_TIMEOUT_MS, () => {
      request.destroy(new Error('Failed to download: request timed out'));
    });
    request.on('error', fail);
  });
}

/**
 * Pick the file extension to use for a downloaded image.
 *
 * Looks at the extension of the URL's path (query string ignored).
 * 'jpeg' is normalised to 'jpg'; unknown or missing extensions fall back
 * to 'jpg'.
 *
 * @param {string} imgUrl - Image URL.
 * @returns {string} Extension without the leading dot.
 */
function getImageExtension(imgUrl) {
  let pathname = imgUrl;
  try {
    pathname = new URL(imgUrl).pathname;
  } catch (err) {
    // Not an absolute URL: fall back to inspecting the raw string.
  }
  const ext = path.extname(pathname).slice(1).toLowerCase();
  if (ext === 'jpeg') return 'jpg';
  return KNOWN_IMAGE_EXTENSIONS.has(ext) ? ext : 'jpg';
}

/**
 * Scrape a Zillow property page
 *
 * @param {object} page - Playwright page used for navigation.
 * @param {{name: string, url: string}} property - Listing entry to scrape.
 * @returns {Promise<object>} Scraped fields: address, price, bedrooms,
 *   bathrooms, sqft, lotSize, yearBuilt, description, images, details.
 */
async function scrapeZillow(page, property) {
  console.log(`Scraping Zillow: ${property.name}`);

  await page.goto(property.url, { waitUntil: 'domcontentloaded', timeout: 60000 });
  await page.waitForTimeout(3000);

  // NOTE: this callback is serialised and executed inside the browser page,
  // so it cannot reference anything from the Node.js scope.
  const data = await page.evaluate(() => {
    const result = {
      address: '',
      price: '',
      bedrooms: '',
      bathrooms: '',
      sqft: '',
      lotSize: '',
      yearBuilt: '',
      description: '',
      images: [],
      details: {}
    };

    // Get address
    const addressEl = document.querySelector('h1[class*="Text"]') ||
      document.querySelector('[data-testid="home-details-chip-container"]')?.parentElement?.querySelector('h1');
    if (addressEl) result.address = addressEl.textContent.trim();

    // Get price
    const priceEl = document.querySelector('[data-testid="price"]') ||
      document.querySelector('.summary-container span');
    if (priceEl) result.price = priceEl.textContent.trim();

    // Get bed/bath/sqft from summary
    const summaryItems = document.querySelectorAll('[data-testid="bed-bath-beyond"] span, .summary-container span');
    summaryItems.forEach((item) => {
      const text = item.textContent.trim().toLowerCase();
      if (text.includes('bd') || text.includes('bed')) {
        result.bedrooms = text.replace(/[^\d.]/g, '');
      }
      // 'ba' also matches 'bath'.
      if (text.includes('ba')) {
        result.bathrooms = text.replace(/[^\d.]/g, '');
      }
      if (text.includes('sqft') || text.includes('sq ft')) {
        // Keep digits only, dropping every thousands separator.
        result.sqft = text.replace(/[^\d]/g, '');
      }
    });

    // Get description
    const descEl = document.querySelector('[data-testid="description-text"]') ||
      document.querySelector('.ds-overview-section');
    if (descEl) result.description = descEl.textContent.trim();

    // Get images from photo gallery
    const imageEls = document.querySelectorAll('picture source[type="image/webp"], picture img, [data-testid*="photo"] img');
    const imageSet = new Set();
    imageEls.forEach((el) => {
      let src = el.srcset || el.src;
      if (!src) return;

      // Get highest res from srcset (last candidate in the list)
      const srcsetParts = src.split(',');
      if (srcsetParts.length > 1) {
        src = srcsetParts[srcsetParts.length - 1].trim().split(' ')[0];
      }

      // Clean up URL
      if (src.startsWith('http')) {
        imageSet.add(src.split('?')[0]);
      }
    });
    result.images = Array.from(imageSet).slice(0, 20);

    // Get facts/details
    const factRows = document.querySelectorAll('[data-testid*="fact"], .ds-home-fact-list-item');
    factRows.forEach((row) => {
      const label = row.querySelector('[data-testid*="label"], .ds-home-fact-list-item-label');
      const value = row.querySelector('[data-testid*="value"], .ds-home-fact-list-item-value');
      if (label && value) {
        result.details[label.textContent.trim()] = value.textContent.trim();
      }
    });

    return result;
  });

  // Try to get more images by clicking on photo gallery (best effort)
  try {
    const photoBtn = await page.$('[data-testid="media-stream-tile"], .photo-tile, [aria-label*="Photo"]');
    if (photoBtn) {
      await photoBtn.click();
      await page.waitForTimeout(2000);

      const moreImages = await page.evaluate(() => {
        const imgs = document.querySelectorAll('[data-testid*="lightbox"] img, .media-stream img, .photo-carousel img');
        return Array.from(imgs)
          .map((img) => img.src)
          .filter((src) => src && src.startsWith('http'));
      });

      const seen = new Set(data.images);
      for (const img of moreImages) {
        if (!seen.has(img)) {
          seen.add(img);
          data.images.push(img);
        }
      }
    }
  } catch (e) {
    console.log(` Could not load additional gallery images: ${e.message}`);
  }

  return data;
}

/**
 * Scrape a HomeFinder property page
 *
 * @param {object} page - Playwright page used for navigation.
 * @param {{name: string, url: string}} property - Listing entry to scrape.
 * @returns {Promise<object>} Scraped fields, same shape as scrapeZillow.
 */
async function scrapeHomefinder(page, property) {
  console.log(`Scraping HomeFinder: ${property.name}`);

  await page.goto(property.url, { waitUntil: 'networkidle', timeout: 60000 });
  await page.waitForTimeout(3000);

  // NOTE: this callback runs inside the browser page, not in Node.js.
  const data = await page.evaluate(() => {
    const result = {
      address: '',
      price: '',
      bedrooms: '',
      bathrooms: '',
      sqft: '',
      lotSize: '',
      yearBuilt: '',
      description: '',
      images: [],
      details: {}
    };

    // Get address from title or header
    const addressEl = document.querySelector('h1, .property-address, [class*="address"]');
    if (addressEl) result.address = addressEl.textContent.trim();

    // Get price
    const priceEl = document.querySelector('.price, [class*="price"], [class*="Price"]');
    if (priceEl) result.price = priceEl.textContent.trim();

    // Look for property details
    const detailItems = document.querySelectorAll('.property-details li, [class*="detail"] span, [class*="spec"]');
    detailItems.forEach((item) => {
      const text = item.textContent.trim().toLowerCase();
      if (text.includes('bed')) result.bedrooms = text.replace(/[^\d.]/g, '');
      if (text.includes('bath')) result.bathrooms = text.replace(/[^\d.]/g, '');
      if (text.includes('sqft') || text.includes('sq ft')) {
        // Digits only, so the value has the same format as the Zillow scraper.
        result.sqft = text.replace(/[^\d]/g, '');
      }
      if (text.includes('acre')) result.lotSize = item.textContent.trim();
      if (text.includes('built')) result.yearBuilt = text.replace(/[^\d]/g, '');
    });

    // Get description
    const descEl = document.querySelector('.description, [class*="description"], .property-description');
    if (descEl) result.description = descEl.textContent.trim();

    // Get images
    const imageEls = document.querySelectorAll('.gallery img, [class*="photo"] img, .carousel img, picture img');
    imageEls.forEach((el) => {
      if (el.src && el.src.startsWith('http')) {
        result.images.push(el.src);
      }
    });

    return result;
  });

  return data;
}

/**
 * Main scraping function
 *
 * Scrapes every entry in PROPERTIES sequentially. A failure on one
 * property is logged and does not stop the others. The browser is always
 * closed, even when an unexpected error escapes the per-property handling.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('Starting property scraper...\n');

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });

  const allProperties = [];

  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      viewport: { width: 1920, height: 1080 }
    });

    for (const property of PROPERTIES) {
      const propertyDir = path.join(OUTPUT_DIR, property.id);
      const imagesDir = path.join(propertyDir, 'images');
      let page = null;

      try {
        // Create directories (recursive mkdir also creates propertyDir)
        fs.mkdirSync(imagesDir, { recursive: true });

        page = await context.newPage();

        const data = property.source === 'zillow'
          ? await scrapeZillow(page, property)
          : await scrapeHomefinder(page, property);

        // Add metadata
        data.id = property.id;
        data.sourceUrl = property.url;
        data.source = property.source;
        data.status = property.status || 'active';
        data.originalName = property.name;

        // Take screenshot
        const screenshotPath = path.join(propertyDir, 'screenshot.png');
        await page.screenshot({ path: screenshotPath, fullPage: true });
        console.log(` Screenshot saved: ${screenshotPath}`);

        // Download images (capped; a single failed image does not abort the rest)
        const imageUrls = data.images.slice(0, MAX_IMAGES_PER_PROPERTY);
        const downloadedImages = [];
        for (const [index, imgUrl] of imageUrls.entries()) {
          const ext = getImageExtension(imgUrl);
          const imgPath = path.join(imagesDir, `image-${String(index + 1).padStart(2, '0')}.${ext}`);

          try {
            await downloadImage(imgUrl, imgPath);
            downloadedImages.push(imgPath);
            console.log(` Downloaded image ${index + 1}/${imageUrls.length}`);
          } catch (err) {
            console.log(` Failed to download image ${index + 1}: ${err.message}`);
          }
        }
        data.downloadedImages = downloadedImages;

        // Save property data
        const jsonPath = path.join(propertyDir, 'property.json');
        fs.writeFileSync(jsonPath, JSON.stringify(data, null, 2));
        console.log(` Data saved: ${jsonPath}\n`);

        allProperties.push(data);
      } catch (err) {
        console.error(` Error scraping ${property.name}: ${err.message}\n`);
      } finally {
        if (page) {
          // Best effort: a page that fails to close must not abort the run.
          await page.close().catch((err) => {
            console.error(` Could not close page: ${err.message}`);
          });
        }
      }

      // Wait between requests
      await new Promise((resolve) => setTimeout(resolve, 2000));
    }
  } finally {
    await browser.close();
  }

  // Save summary
  const summaryPath = path.join(OUTPUT_DIR, 'all-properties.json');
  fs.writeFileSync(summaryPath, JSON.stringify(allProperties, null, 2));
  console.log(`\nSummary saved to: ${summaryPath}`);
  console.log(`Total properties scraped: ${allProperties.length}`);
}

main().catch((err) => {
  console.error(err);
  // Signal failure to the calling shell / CI.
  process.exitCode = 1;
});