How to Scrape E-Commerce Product Data with Python
E-commerce sites are one of the most common scraping targets. Price monitoring, competitive analysis, product research — it all starts with extracting product data reliably.
This guide walks through scraping product information from e-commerce sites, from inspecting the page to storing structured data.
What Data to Extract
A typical product scrape collects:
- Product name and description
- Price (current, original, discount percentage)
- Ratings and review counts
- Images (URLs, not the files themselves)
- Specifications (size, weight, material, etc.)
- Availability (in stock, out of stock)
- SKU or product ID (for deduplication)
# Shape of one scraped product record. Fields default to empty/zero so
# downstream code (CSV export, price history) never hits a missing key.
product = {
"name": "",
"price": 0.0,  # current sale price
"original_price": 0.0,  # pre-discount price (0.0 when not on sale)
"rating": 0.0,  # average star rating
"review_count": 0,
"image_url": "",  # URL only — image files are not downloaded
"specs": {},  # free-form key/value specifications
"in_stock": True,
"url": "",  # canonical product page URL, used for deduplication
}
Inspecting Site Structure with DevTools
Before writing any code, spend five minutes in Chrome DevTools.
1. Right-click on a product name and select "Inspect"
2. Note the element tag and class names (e.g., `<h1 class="product-title">`)
3. Check the Network tab — does the page load data via an API call?
4. Look at multiple products to confirm the structure is consistent
Building the Scraper Step by Step
import json
import re
import time

import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent: many sites reject requests that present the
# default python-requests UA.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
}
def scrape_product_page(url):
    """Fetch one product page and return its fields as a dict.

    Raises requests.HTTPError (via raise_for_status) on non-2xx responses.
    """
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    page = BeautifulSoup(resp.text, "lxml")

    # The phrase "out of stock" anywhere in the visible text marks the
    # item unavailable.
    available = "out of stock" not in page.get_text().lower()

    return {
        "url": url,
        "name": extract_text(page, "h1.product-title"),
        "price": extract_price(page, ".current-price"),
        "original_price": extract_price(page, ".original-price"),
        "rating": extract_float(page, ".star-rating"),
        "review_count": extract_int(page, ".review-count"),
        "image_url": extract_attr(page, ".product-image img", "src"),
        "in_stock": available,
    }
def extract_text(soup, selector):
    """Return the stripped text of the first match for *selector*, or ""."""
    node = soup.select_one(selector)
    if node is None:
        return ""
    return node.get_text(strip=True)
def extract_price(soup, selector):
    """Parse the first matched element's text as a price.

    Generalized beyond the original "$"-only handling: extracts the first
    numeric run (digits with optional comma separators and a decimal
    part), so any currency symbol or prefix text works —
    "$1,299.99" -> 1299.99, "€49.99" -> 49.99.

    Returns 0.0 when the element is missing or contains no number.
    """
    el = soup.select_one(selector)
    if not el:
        return 0.0
    text = el.get_text(strip=True)
    # First run of digits, allowing thousands commas and a decimal point.
    match = re.search(r"\d[\d,]*(?:\.\d+)?", text)
    if not match:
        return 0.0
    try:
        return float(match.group().replace(",", ""))
    except ValueError:
        # e.g. pathological comma placement that still isn't a number
        return 0.0
def extract_float(soup, selector):
    """Return the matched element's text parsed as a float, or 0.0."""
    node = soup.select_one(selector)
    if not node:
        return 0.0
    raw = node.get_text(strip=True)
    try:
        value = float(raw)
    except ValueError:
        value = 0.0
    return value
def extract_int(soup, selector):
    """Return all digits in the matched element's text as an int, or 0."""
    node = soup.select_one(selector)
    if not node:
        return 0
    # Keeping only digit characters also drops thousands separators,
    # so "1,234 reviews" -> 1234.
    digits = [ch for ch in node.get_text(strip=True) if ch.isdigit()]
    if not digits:
        return 0
    return int("".join(digits))
def extract_attr(soup, selector, attr):
    """Return attribute *attr* of the first element matching *selector*.

    Uses an explicit ``is None`` check instead of truthiness: in
    BeautifulSoup a Tag's truth value falls back to ``__len__`` (its
    child count), so void elements like <img> — exactly what this helper
    is used for — are falsy and the original ``if el`` discarded them.

    Returns "" when nothing matches or the attribute is absent.
    """
    el = soup.select_one(selector)
    if el is None:
        return ""
    return el.get(attr, "")
Helper functions keep the main scraping logic clean and handle missing elements without crashing.
Handling Product Variants
Products often have multiple variants — different sizes, colors, or configurations. These are usually loaded via JavaScript or hidden in the page source.
import json
import re
def extract_variants(soup):
    """Extract the variant list from JSON embedded in a <script> tag.

    Many storefronts inline something like ``var productData = {...};``.
    This scans every script, finds that assignment, parses the JSON
    object, and returns its "variants" list ([] when nothing usable is
    found).
    """
    for script in soup.select("script"):
        text = script.string or ""
        if "variants" not in text and "productData" not in text:
            continue
        # Bug fix: the original pattern had its backslashes stripped
        # ('productDatas*=s*'), so it never matched 'productData = {...}'
        # with whitespace around '='. '\s*' is the intended form.
        match = re.search(r'productData\s*=\s*({.*?});', text, re.DOTALL)
        if not match:
            continue
        try:
            data = json.loads(match.group(1))
        except json.JSONDecodeError:
            # JS object literals aren't always valid JSON — keep scanning
            # instead of crashing.
            continue
        return data.get("variants", [])
    return []
Look for `<script>` tags with `type="application/ld+json"` structured data too. Many sites include product schema markup that's easy to parse.
def extract_structured_data(soup):
    """Extract product fields from JSON-LD structured data.

    Returns {"name", "price", "currency", "rating"} for the first
    Product entity found, or None.

    Fixes over the naive version:
    - ``script.string`` is None for empty/multi-child tags; the original
      passed it straight to json.loads, raising an uncaught TypeError.
    - Top-level JSON-LD is often a list, which the original crashed on
      with AttributeError; non-dict payloads are now skipped.
    - "offers" is frequently a list of offer objects; the first one is
      used in that case.
    """
    for script in soup.select('script[type="application/ld+json"]'):
        raw = script.string
        if not raw:
            continue
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            continue
        if not isinstance(data, dict) or data.get("@type") != "Product":
            continue
        offers = data.get("offers", {})
        if isinstance(offers, list):
            offers = offers[0] if offers else {}
        return {
            "name": data.get("name"),
            "price": offers.get("price"),
            "currency": offers.get("priceCurrency"),
            "rating": data.get("aggregateRating", {}).get("ratingValue"),
        }
    return None
Pagination Across Category Pages
Most e-commerce scraping starts with category pages, then visits each product URL.
def scrape_category(category_url, max_pages=20):
    """Collect product URLs from a paginated category listing.

    Walks ?page=1..max_pages, stops at the first page with no product
    links, and returns the URLs deduplicated in discovery order.

    Raises requests.HTTPError on a non-2xx page response.
    """
    product_urls = []
    for page in range(1, max_pages + 1):
        url = f"{category_url}?page={page}"
        response = requests.get(url, headers=headers, timeout=10)
        # Consistent with scrape_product_page: fail loudly instead of
        # silently parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        links = soup.select("a.product-link")
        if not links:
            break  # ran past the last page
        for link in links:
            href = link.get("href", "")
            if href.startswith("/"):
                # Relative link — prepend the site origin.
                href = "https://example.com" + href
            product_urls.append(href)
        print(f"Page {page}: found {len(links)} products")
        time.sleep(1)  # Respect the server
    # dict.fromkeys dedupes while preserving first-seen order;
    # list(set(...)) scrambled the crawl order.
    return list(dict.fromkeys(product_urls))
# Crawl the category listing first, then visit each product page,
# pausing a second between requests.
product_urls = scrape_category("https://example.com/category/electronics")
products = []
for product_url in product_urls:
    products.append(scrape_product_page(product_url))
    time.sleep(1)
Storing Data in CSV and JSON
CSV Output
import csv
def save_to_csv(products, filename="products.csv"):
    """Write a list of product dicts to *filename* as UTF-8 CSV.

    Column order comes from the first product's keys; nested values
    (dicts/lists) are stringified by the csv module. Does nothing for an
    empty list (no header-only file is created).
    """
    if not products:
        return
    fieldnames = products[0].keys()
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(products)
    # Bug fix: the message printed a literal "(unknown)" placeholder
    # instead of the destination filename.
    print(f"Saved {len(products)} products to {filename}")
JSON Output
def save_to_json(products, filename="products.json"):
    """Dump products to *filename* as pretty-printed UTF-8 JSON.

    ensure_ascii=False keeps non-ASCII product names readable in the
    output file.
    """
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(products, f, indent=2, ensure_ascii=False)
    # Bug fix: report the actual filename rather than "(unknown)".
    print(f"Saved {len(products)} products to {filename}")
JSON preserves nested data (like specs and variants) better than CSV.
Dealing with Anti-Bot Protection
E-commerce sites invest heavily in anti-bot systems. Here's what helps:
import random
import time
# Rotate User-Agent strings so successive requests don't all present the
# same browser fingerprint.
user_agents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) Chrome/125.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
]
# Random delays between requests (1-3 seconds) — uniform jitter looks
# less mechanical than a fixed interval.
time.sleep(random.uniform(1, 3))
# Use sessions to maintain cookies (and reuse TCP connections) across
# requests, like a real browser session would.
session = requests.Session()
session.headers.update({"User-Agent": random.choice(user_agents)})
For heavily protected sites, you may need residential proxies and Playwright. Check the site's protections before building your scraper — it saves time.
Building a Price Monitoring Script
Once you can scrape product data, price monitoring is straightforward:
import json
import os
from datetime import datetime
def monitor_price(product_url, data_file="price_history.json"):
    """Scrape the current price and append it to a JSON history file.

    History layout:
        {product_url: {"name": str, "prices": [{"price": float, "date": str}, ...]}}
    The file is created on the first run. Files are opened with explicit
    UTF-8 encoding (the original relied on the platform default, which
    can corrupt non-ASCII product names on some systems).
    """
    product = scrape_product_page(product_url)

    # Load existing history (empty on first run).
    if os.path.exists(data_file):
        with open(data_file, encoding="utf-8") as f:
            history = json.load(f)
    else:
        history = {}

    # Keyed by URL so one file can track many products.
    product_id = product_url
    if product_id not in history:
        history[product_id] = {"name": product["name"], "prices": []}
    history[product_id]["prices"].append({
        "price": product["price"],
        # Local time; NOTE(review): use a UTC timestamp if runs may span
        # machines/timezones — confirm the deployment setup.
        "date": datetime.now().isoformat(),
    })

    # Save updated history.
    with open(data_file, "w", encoding="utf-8") as f:
        json.dump(history, f, indent=2)

    print(f"{product['name']}: ${product['price']}")
Run this on a schedule (cron job or task scheduler) and you've got a basic price tracker.
Ethical Considerations
E-commerce scraping sits in a gray area. Stay on the right side:
- Check robots.txt before scraping any site
- Rate limit your requests — one per second is a good baseline
- Don't scrape personal data (reviews with usernames may qualify)
- Cache results — don't re-scrape pages you already have
- Respect the site's Terms of Service when using the data commercially
- Don't hammer the site during peak hours
What's Next
E-commerce scraping is one of the most practical applications of web scraping. From here, you can build price comparison tools, market research datasets, and automated monitoring systems.
The Master Web Scraping course includes a complete e-commerce scraping project where you build a production-ready price monitor from scratch.