SpiderSite uses the Crawl4AI library to extract content from websites, with optional AI-powered data extraction. This guide covers everything from basic scraping to advanced AI-powered extraction.
Add natural language instructions to extract specific data:
Copy
# Submit a scrape job with natural-language extraction instructions.
# Relies on `url`, `headers`, and `requests` being defined earlier in the guide.
data = {
    "url": "https://blog.example.com/future-of-ai",
    "instructions": """
    Extract the following information:
    - Article title
    - Author name and bio
    - Publication date
    - Article category/tags
    - Full article content (without ads or sidebars)
    - All code snippets or examples
    - References or citations
    """,
}
response = requests.post(url, headers=headers, json=data)
# Blog-article extraction example: request payload only (no request sent here).
data = {
    "url": "https://techblog.com/article",
    "instructions": """
    Extract:
    - Article title
    - Author name
    - Publication date
    - Reading time
    - Main article content (exclude ads, comments, sidebars)
    - Tags/categories
    - Featured image URL
    """,
}
# E-commerce product-page extraction example: request payload only.
data = {
    "url": "https://shop.example.com/products/laptop-pro",
    "instructions": """
    Extract product information:
    - Product name and SKU
    - Current price and original price (if on sale)
    - Discount percentage
    - Product description
    - Technical specifications (all fields)
    - Stock availability
    - Shipping information
    - Customer rating (average)
    - Number of reviews
    - Seller/brand name
    """,
}
# API-documentation page extraction example: request payload only.
data = {
    "url": "https://docs.example.com/api/authentication",
    "instructions": """
    Extract from this API documentation:
    - Page title
    - API endpoint path and method
    - All request parameters with descriptions
    - Request body schema
    - Response schema
    - All code examples (preserve language tags)
    - Error codes and descriptions
    """,
}
# News-article extraction example: request payload only.
data = {
    "url": "https://news.example.com/breaking-story",
    "instructions": """
    Extract:
    - Headline
    - Subheadline
    - Author and publication
    - Date and time
    - Full article text
    - All quoted statements (with attribution)
    - Related article links
    - Image captions
    """,
}
Extract the following:
- Product name (exact text from h1 heading)
- Price (numeric value only, without currency symbol)
- Product description (first paragraph only)
- Specifications as a list:
  * CPU
  * RAM
  * Storage
  * Display size
- Stock status (in stock / out of stock / pre-order)
import requests
import time


def batch_scrape(urls, instructions, batch_size=10):
    """Submit scrape jobs in batches to avoid hitting the API rate limit.

    Args:
        urls: Sequence of page URLs to scrape.
        instructions: Natural-language extraction instructions applied to
            every URL.
        batch_size: Number of jobs submitted before pausing (default 10).

    Returns:
        List of job IDs, one per submitted URL, in submission order.

    NOTE(review): relies on module-level ``submit_url`` and ``headers``
    being defined before this is called — confirm against the caller.
    """
    job_ids = []
    for i in range(0, len(urls), batch_size):
        batch = urls[i:i + batch_size]
        for url in batch:
            response = requests.post(
                submit_url,
                headers=headers,
                json={"url": url, "instructions": instructions},
            )
            job_ids.append(response.json()['job_id'])
        # Wait between batches to respect rate limits (10 requests per minute);
        # skip the sleep after the final batch.
        if i + batch_size < len(urls):
            time.sleep(6)
    return job_ids
import time  # fix: original snippet called time.sleep without importing time
from concurrent.futures import ThreadPoolExecutor

import requests


def get_job_result(job_id):
    """Poll the results endpoint for one job until it completes.

    Returns the parsed JSON result dict on HTTP 200, or None if the job
    reports an error status or does not finish within the retry budget
    (40 polls x 3 s ≈ 2 minutes; HTTP 202 means "still processing").

    NOTE(review): relies on module-level ``API_BASE`` and ``headers`` —
    confirm they are defined before calling.
    """
    max_retries = 40
    for _ in range(max_retries):
        response = requests.get(
            f"{API_BASE}/jobs/{job_id}/results",
            headers=headers,
        )
        if response.status_code == 200:
            return response.json()
        elif response.status_code == 202:
            time.sleep(3)  # still processing; poll again shortly
        else:
            return None  # unexpected status: treat as failure
    return None  # retry budget exhausted


# Poll all submitted jobs in parallel with a small thread pool.
with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(get_job_result, job_ids))

# Keep only jobs that completed and reported success.
successful = [r for r in results if r and r['success']]
print(f"Completed: {len(successful)}/{len(job_ids)} jobs")