Web Scraping Project: Company Website Product Scraper


Let's build a scraper for a hypothetical company website (we'll use example-company.com) to extract product information. I'll explain each part in detail and provide a complete project structure.
Project Structure
company_scraper/
│
├── company_scraper/
│   ├── __init__.py
│   ├── scraper.py
│   └── exceptions.py
│
├── tests/
│   ├── __init__.py
│   └── test_scraper.py
│
├── setup.py
├── requirements.txt
├── README.md
└── .gitignore
Step-by-Step Implementation
1. scraper.py (Core Scraping Logic)
import requests
from bs4 import BeautifulSoup

from .exceptions import ScraperError


class CompanyScraper:
    """
    Scrapes product data from example-company.com
    """

    BASE_URL = "https://www.example-company.com/products"

    def __init__(self):
        # Set up a session with custom headers to mimic a browser
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "Accept-Language": "en-US,en;q=0.9"
        })

    def get_products(self, max_products=10):
        """
        Extract product information from the company website.
        Returns a list of dictionaries with product details.
        """
        try:
            # 1. Fetch the page
            response = self.session.get(self.BASE_URL)
            response.raise_for_status()  # Raise on HTTP errors

            # 2. Parse the HTML content
            soup = BeautifulSoup(response.text, "html.parser")

            # 3. Find product containers - adjust the selector to the actual site
            product_cards = soup.select(".product-card")[:max_products]

            # 4. Extract data from each product card
            products = []
            for card in product_cards:
                product = {
                    "name": self._get_text(card, ".product-name"),
                    "price": self._get_text(card, ".price"),
                    "description": self._get_text(card, ".description"),
                    "url": self._get_attribute(card, "a.product-link", "href")
                }
                products.append(product)

            return products
        except Exception as e:
            raise ScraperError(f"Failed to scrape products: {e}") from e

    def _get_text(self, element, selector):
        """Helper to safely extract text from a matched element"""
        found = element.select_one(selector)
        return found.text.strip() if found else None

    def _get_attribute(self, element, selector, attr):
        """Helper to safely extract an attribute from a matched element"""
        found = element.select_one(selector)
        return found[attr] if found and attr in found.attrs else None
Imports:
- requests for HTTP requests
- BeautifulSoup for HTML parsing
- Custom ScraperError for error handling

Class Initialization:
- BASE_URL: the target products page
- __init__: sets up a session with headers to mimic a browser

get_products() Method:
- Takes a max_products parameter to limit results
- Makes an HTTP GET request to the products page
- Uses raise_for_status() to catch HTTP errors
- Parses the HTML with BeautifulSoup
- Finds product cards using a CSS selector (adjust based on the real site)
- Loops through each product card to extract:
  - Name (from the .product-name element)
  - Price (from the .price element)
  - Description (from the .description element)
  - URL (the href attribute from the product link)

Helper Methods:
- _get_text(): safely extracts text from an element
- _get_attribute(): safely extracts attribute values
2. exceptions.py (Error Handling)
class ScraperError(Exception):
    """Base exception for scraping errors"""


class HTTPError(ScraperError):
    """Raised when an HTTP request fails"""


class ParseError(ScraperError):
    """Raised when content parsing fails"""
3. __init__.py (Package Exports)
from .scraper import CompanyScraper
__all__ = ["CompanyScraper"]
Testing the Scraper
test_scraper.py
import pytest
from unittest.mock import patch

from company_scraper import CompanyScraper


@patch("company_scraper.scraper.requests.Session")
def test_get_products(mock_session):
    # Set up a mock response containing a single product card
    mock_response = mock_session.return_value.get.return_value
    mock_response.text = """
    <html>
        <div class="product-card">
            <h3 class="product-name">Test Product</h3>
            <span class="price">$19.99</span>
            <p class="description">Test description</p>
            <a class="product-link" href="/products/1"></a>
        </div>
    </html>
    """
    mock_response.raise_for_status.return_value = None

    # Run the scraper against the mocked session
    scraper = CompanyScraper()
    products = scraper.get_products()

    assert len(products) == 1
    assert products[0]["name"] == "Test Product"
    assert products[0]["price"] == "$19.99"
Packaging for Distribution
setup.py
from setuptools import setup, find_packages

setup(
    name="company_scraper",
    version="0.1",
    packages=find_packages(),
    install_requires=[
        "requests>=2.26.0",
        "beautifulsoup4>=4.10.0"
    ],
    python_requires=">=3.7",
)
Install Locally
pip install -e .
Usage Example
from company_scraper import CompanyScraper

scraper = CompanyScraper()
products = scraper.get_products(max_products=5)

for product in products:
    print(f"{product['name']} - {product['price']}")
    print(f"Description: {product['description']}")
    print(f"URL: {product['url']}\n")
Key Considerations
Legal/Ethical:
- Check robots.txt (e.g., https://www.example-company.com/robots.txt)
- Respect User-Agent policies
- Add delays between requests (see the sketch after this list)
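Both checks can be done with the standard library alone; a minimal sketch using urllib.robotparser and time (the robots.txt URL belongs to the hypothetical site):

import time
from urllib.robotparser import RobotFileParser

# Download and parse the site's robots.txt
robots = RobotFileParser("https://www.example-company.com/robots.txt")
robots.read()

url = "https://www.example-company.com/products"
if robots.can_fetch("Mozilla/5.0", url):
    # ... fetch the page here ...
    time.sleep(2)  # polite delay before the next request
else:
    print("robots.txt disallows fetching this page")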
Selector Adjustments:
- Update CSS selectors based on the actual website structure (a fallback sketch follows this list)
- Use browser DevTools to inspect elements
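One way to soften selector churn is to try a list of candidate selectors in order. A hedged sketch; the alternative selector names are made up for illustration:

from bs4 import BeautifulSoup

# Candidate selectors, tried in order; refine this list after
# inspecting the real site's markup in DevTools
CARD_SELECTORS = [".product-card", ".product-tile", "li.product"]

def find_product_cards(soup: BeautifulSoup) -> list:
    """Return the first non-empty match among the candidate selectors."""
    for selector in CARD_SELECTORS:
        cards = soup.select(selector)
        if cards:
            return cards
    return []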
Error Handling:
- Add retries for failed requests (sketched below)
- Handle missing elements gracefully
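requests sessions support retries through a transport adapter with a urllib3 retry policy; a minimal sketch that could be folded into __init__:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()

# Retry up to 3 times on transient server errors, with exponential backoff
retry = Retry(
    total=3,
    backoff_factor=0.5,
    status_forcelist=[429, 500, 502, 503, 504],
)
adapter = HTTPAdapter(max_retries=retry)
session.mount("https://", adapter)
session.mount("http://", adapter)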
Advanced Features:
- Add caching with requests-cache
- Implement pagination support (both sketched below)
- Add async support with aiohttp
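As a sketch of the first two ideas: requests-cache provides a drop-in CachedSession, and pagination can be a simple page loop. The ?page= query parameter is an assumption about how the hypothetical site paginates:

from bs4 import BeautifulSoup
from requests_cache import CachedSession

# Drop-in replacement for requests.Session with on-disk caching
session = CachedSession("company_cache", expire_after=3600)

names = []
page = 1
while True:
    # Assumed pagination scheme; adjust to the real site's URLs
    response = session.get(
        "https://www.example-company.com/products", params={"page": page}
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    cards = soup.select(".product-card")
    if not cards:
        break  # no more pages
    names.extend(card.get_text(strip=True) for card in cards)
    page += 1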
This structure provides a professional foundation you can adapt for real company websites by adjusting the selectors and URL.