Cleaning Scraped Data: Remove Noise & Normalize Output
Data cleaning in web scraping is the process of transforming raw, messy extracted data into consistent, accurate, and usable structured data. This includes removing whitespace, parsing values, handling missing data, and normalizing formats.
Why Scraped Data Needs Cleaning
Raw scraped data is almost always messy:
- Extra whitespace and newlines: `" \n Widget Pro \n "`
- Prices as strings: `"$1,299.99"`, `"USD 1299.99"`, `"1.299,99 €"`
- Inconsistent dates: `"March 5, 2026"`, `"2026-03-05"`, `"05/03/2026"`
- HTML artifacts: entity-encoded text such as `"Widget &amp; Co"` that should read "Widget & Co"
- Missing values: empty strings, `"N/A"`, `"null"`
Essential Cleaning Functions
import re
from datetime import datetime
def clean_text(text: str) -> str:
    """Collapse all runs of whitespace (spaces, tabs, newlines) to single spaces.

    Args:
        text: Raw scraped string, possibly padded with newlines/indentation.

    Returns:
        The normalized string; empty input yields "".
    """
    # str.split() with no argument drops leading/trailing whitespace and
    # splits on any whitespace run, so no extra .strip() is needed.
    return " ".join(text.split())
def clean_price(text: str) -> float | None:
"""Extract numeric price from any format"""
match = re.search(r'[\d,.]+', text.replace(',', ''))
return float(match.group()) if match else None
def clean_date(text: str) -> str | None:
"""Parse various date formats to ISO format"""
formats = ["%B %d, %Y", "%Y-%m-%d", "%m/%d/%Y", "%d %b %Y"]
for fmt in formats:
try:
return datetime.strptime(text.strip(), fmt).date().isoformat()
except ValueError:
continue
return None
def clean_url(url: str, base: str) -> str:
    """Resolve a possibly-relative scraped URL against *base*.

    Absolute URLs pass through unchanged; relative paths are joined
    onto the base page URL.
    """
    from urllib.parse import urljoin

    trimmed = url.strip()
    return urljoin(base, trimmed)
Handling Missing Data
def clean_field(value: str | None, default: str = "") -> str:
if value is None:
return default
cleaned = clean_text(value)
if cleaned.lower() in ("n/a", "null", "none", "-", ""):
return default
return cleaned
Deduplication
# Simple deduplication by URL: keep the first product seen for each URL,
# preserving the original list order.
seen_urls = set()
unique_products = []
for product in products:
    url = product["url"]
    if url in seen_urls:
        continue  # already kept an earlier product with this URL
    seen_urls.add(url)
    unique_products.append(product)
Validation
def validate_product(product: dict) -> list[str]:
    """Validate a scraped product record.

    Args:
        product: Dict expected to carry "name", "price", and "url" keys.

    Returns:
        A list of human-readable error messages; empty when the record
        is valid.
    """
    errors = []
    if not product.get("name"):
        errors.append("Missing name")
    price = product.get("price")
    # Missing, zero, or negative prices are all invalid.
    if not price or price <= 0:
        errors.append("Invalid price")
    # Require an explicit scheme: a bare startswith("http") would also
    # accept junk like "httpfoo" or "httpsomething".
    if not product.get("url", "").startswith(("http://", "https://")):
        errors.append("Invalid URL")
    return errors
Pro Tip: Clean During Extraction
Don't scrape everything raw and clean later. Apply basic cleaning as you extract:
# Normalize the title string the moment it is pulled from the DOM.
name = clean_text(element.select_one(".title").text)
# Parse the raw price text into a float at extraction time.
price = clean_price(element.select_one(".price").text)