from re import sub, IGNORECASE, findall
from requests import Session
from bs4 import BeautifulSoup
import asyncio
import random
import logging


def _parse_price(price_text):
    """Strip currency symbols, separators and whitespace from *price_text*.

    Returns the price as a float, or None when no digits remain after
    cleaning (e.g. an empty or purely symbolic string).
    """
    cleaned = sub(r"[₹,$\s]", "", price_text, flags=IGNORECASE)
    cleaned = sub(r"[^\d.]", "", cleaned)  # keep only digits and dots
    return float(cleaned) if cleaned else None


def _clean_page_title(title_text):
    """Trim Flipkart marketing boilerplate from a page <title> string.

    Removes a leading "Buy " and everything from " online" / " at best"
    onward, returning the stripped bare product name.
    """
    cleaned = title_text.replace("Buy ", "").split(" online")[0].split(" at best")[0]
    return cleaned.strip()


async def scrape_flipkart_price_and_name(url):
    """Scrape both the Flipkart product price and name in a single request.

    Fetching both values from one page load halves the request count,
    which helps avoid Flipkart's rate limiting.

    Args:
        url: Flipkart product page URL.

    Returns:
        A ``(product_name, price)`` tuple. Either element may be None when
        it could not be extracted; ``(None, None)`` on any request failure.

    NOTE(review): ``requests`` is a blocking HTTP client, so the event loop
    is blocked while each request is in flight; only the sleeps yield
    control. Consider aiohttp if true concurrency is ever needed.
    """
    # Rotate user agents so repeated calls look less like a single bot.
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
    ]

    session = Session()
    # Browser-like headers: Flipkart serves 403s to obviously scripted clients.
    session.headers.update({
        'User-Agent': random.choice(user_agents),
        'Accept-Language': 'en-US,en;q=0.9,hi;q=0.8',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://www.flipkart.com/',
        'Origin': 'https://www.flipkart.com',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
    })

    try:
        # Random delay to avoid tripping rate limits on back-to-back calls.
        await asyncio.sleep(random.uniform(2, 5))

        # Visit the homepage first so the session picks up Flipkart cookies.
        session.get('https://www.flipkart.com/', timeout=10)
        await asyncio.sleep(random.uniform(1, 2))

        # Now fetch the actual product page.
        data = session.get(url, timeout=15)
        if data.status_code != 200:
            logging.error(f"Error fetching URL: {url}, status code: {data.status_code}")
            if data.status_code == 403:
                logging.error("403 Forbidden - Consider using a VPN or proxy service")
            return None, None

        soup = BeautifulSoup(data.content, 'lxml')

        # --- Price extraction -------------------------------------------
        # Flipkart rotates its CSS class names; try known selectors in order.
        price_selectors = [
            "div._30jeq3._16Jk6d",
            "div._30jeq3",
            "div._16Jk6d",
            "span._30jeq3._16Jk6d",
            "span._30jeq3",
            "div.Nx9bqj.CxhGGd",
            "div._30jeq3.CgYfOw.j-ONb3",
        ]
        price = None
        pricediv = None
        for selector in price_selectors:
            pricediv = soup.select_one(selector)
            if pricediv and pricediv.get_text(strip=True):
                break
        if pricediv and pricediv.get_text(strip=True):
            price_text = pricediv.get_text(strip=True)
            logging.info(f"Found price text: {price_text}")
            price = _parse_price(price_text)
            if price is not None:
                logging.info(f"Successfully extracted price: Rs.{price}")

        # --- Product-name extraction ------------------------------------
        title_selectors = [
            "span.VU-ZEz",                    # newer Flipkart design
            "span.B_NuCI",
            "h1.x-product-title-label",
            "h1._35KyD6",
            "span._35KyD6",
            "h1.yhZ71d",
            "span.yhZ71d",
            "span.VU-ZEz._36FX1L",            # another common selector
            "h1.Wphh3N",                      # another possible title selector
            "[data-testid='product-title']",  # data-attribute selector
        ]
        product_name = None
        for selector in title_selectors:
            title_elem = soup.select_one(selector)
            if title_elem and title_elem.get_text(strip=True):
                product_name = title_elem.get_text(strip=True)
                logging.info(f"Found title with selector '{selector}': {product_name[:50]}...")
                break

        # Fall back to the page <title>, minus its marketing boilerplate.
        if not product_name:
            title = soup.find('title')
            if title:
                title_text = title.get_text()
                logging.info(f"Using page title: {title_text}")
                product_name = _clean_page_title(title_text)

        return product_name, price
    except Exception as e:
        logging.error(f"Exception during request to {url}: {e}")
        return None, None
    finally:
        # Always release the connection pool, even on error paths.
        session.close()


async def scrape_flipkart_price(url):
    """Scrape only the Flipkart product price.

    Thin wrapper around scrape_flipkart_price_and_name; returns the price
    as a float, or None if it could not be extracted.
    """
    _, price = await scrape_flipkart_price_and_name(url)
    return price


def extract_flipkart_product_name(url):
    """Synchronously extract the product name from a Flipkart URL.

    Drives the combined async scraper on a fresh event loop via
    asyncio.run, which (unlike the previous manual new_event_loop /
    run_until_complete / close sequence) guarantees the loop is closed
    even when the coroutine raises. Must not be called from inside an
    already-running event loop.

    Returns the product name, or None on any failure.
    """
    try:
        product_name, _ = asyncio.run(scrape_flipkart_price_and_name(url))
        return product_name
    except Exception as e:
        logging.error(f"Error extracting product name: {e}")
        return None