#Code Generated by Parser:3ae9404d Rule: 2025-01-07 14:31:13
def get_html(url, wait_seconds=10, headless=False, debug_path="debug.html"):
    """Load *url* in Chromium through Playwright and return the rendered HTML.

    Args:
        url: Address passed to ``page.goto``.
        wait_seconds: Pause after navigation before the DOM is serialized.
            Defaults to 10, the value that used to be hard-coded.
        headless: Forwarded to ``chromium.launch``. Defaults to False (a
            visible browser window), matching the previous behavior.
        debug_path: File that receives a UTF-8 copy of the HTML. Defaults to
            "debug.html" as before; pass None to skip writing the file.

    Returns:
        The string returned by ``page.content()``.
    """
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        try:
            page = browser.new_page()
            page.goto(url)
            # Playwright's docs recommend wait_for_timeout over time.sleep in
            # the sync API so the page state read afterwards is not stale.
            page.wait_for_timeout(wait_seconds * 1000)
            page_source = page.content()
        finally:
            # Close the browser even when navigation or capture raises.
            browser.close()

    if debug_path is not None:
        with open(debug_path, "w", encoding="utf-8") as file:
            file.write(page_source)
    return page_source
from lxml.cssselect import CSSSelector
from decimal import Decimal
import re
def extract_SellPrice(lxml_tree):
    """Return the first price found under ``div#corePrice_desktop``.

    The text of the first ``span.a-offscreen`` match is reduced to digits and
    dots, runs of dots are collapsed to one, and the result is converted to
    ``Decimal``.

    Args:
        lxml_tree: Parsed document accepted by ``CSSSelector``.

    Returns:
        The price as ``Decimal``, or None when no element matches or the
        matched text does not contain a parseable number.
    """
    from decimal import InvalidOperation

    matches = CSSSelector('div#corePrice_desktop span.a-offscreen')(lxml_tree)
    if not matches:
        return None

    price = matches[0].text_content().strip()
    price = re.sub(r'[^\d.]', '', price)   # drop currency symbols, commas, spaces
    price = re.sub(r'\.+', '.', price)     # collapse repeated dots
    try:
        return Decimal(price)
    except InvalidOperation:
        # Empty span or leftover text such as "." -- treat as "no price".
        return None
def _sellprice2_to_decimal(node):
    """Convert an element's text to ``Decimal``; raises InvalidOperation if unparseable."""
    cleaned = re.sub(r'[^\d.]', '', node.text_content().strip())
    return Decimal(re.sub(r'\.+', '.', cleaned))


def extract_SellPrice2(lxml_tree):
    """Return the highest price of a price range, else the single price.

    Elements under ``span.a-price-range`` are tried first and the maximum of
    their parsed values is returned. If no range is present, or a range value
    cannot be parsed, the first ``span.a-offscreen`` under
    ``div#corePrice_desktop`` is used instead.

    Args:
        lxml_tree: Parsed document accepted by ``CSSSelector``.

    Returns:
        The price as ``Decimal``, or None when nothing matches or the
        fallback text is not a parseable number.
    """
    from decimal import InvalidOperation

    selector_range = CSSSelector('div#corePrice_desktop span.a-price-range span.a-offscreen')
    selector_single = CSSSelector('div#corePrice_desktop span.a-offscreen')

    range_nodes = selector_range(lxml_tree)
    if range_nodes:
        try:
            return max(_sellprice2_to_decimal(node) for node in range_nodes)
        except InvalidOperation:
            # A range entry was not numeric; fall back to the single price
            # (only parse failures are handled, not arbitrary exceptions).
            pass

    single_nodes = selector_single(lxml_tree)
    if single_nodes:
        try:
            return _sellprice2_to_decimal(single_nodes[0])
        except InvalidOperation:
            return None
    return None
def extract_ProductName(lxml_tree):
    """Return the product title with whitespace normalized.

    Looks up ``span#productTitle`` (with its two CSS classes), strips the
    text and collapses every whitespace run into a single space. Returns an
    empty string when the element is absent.
    """
    from lxml.cssselect import CSSSelector
    import re

    find_title = CSSSelector("span#productTitle.a-size-large.product-title-word-break")
    title_nodes = find_title(lxml_tree)
    if not title_nodes:
        return ""

    raw_title = title_nodes[0].text_content().strip()
    return re.sub(r'\s+', ' ', raw_title)
def extract_TotalReview(lxml_tree):
    """Return the review count shown in ``span#acrCustomerReviewText``.

    The first digit group in the element text (thousands separators allowed)
    is converted to ``int``.

    Args:
        lxml_tree: Parsed document accepted by ``CSSSelector``.

    Returns:
        The count as ``int``, or None when the element is missing or its
        text contains no digits.
    """
    from lxml.cssselect import CSSSelector
    import re

    selector = CSSSelector("a#acrCustomerReviewLink span#acrCustomerReviewText")
    result = selector(lxml_tree)
    if not result:
        return None

    # Require a leading digit so a stray "," cannot match on its own and
    # make int('') raise ValueError.
    match = re.search(r'\d[\d,]*', result[0].text_content())
    if match is None:
        return None
    return int(match.group().replace(',', ''))
def extract_ProductImage(lxml_tree):
    """Return the ``src`` attribute of the ``img#landingImage`` element.

    Returns an empty string when no such element exists.
    """
    from lxml.cssselect import CSSSelector

    find_image = CSSSelector("img#landingImage.a-dynamic-image.a-stretch-vertical")
    image_nodes = find_image(lxml_tree)
    if image_nodes:
        return image_nodes[0].get("src")
    return ""
def extract_AverageReview(lxml_tree):
    """Return the average rating found inside ``span#acrPopover``.

    The first ``digits.digits`` token in the matched element's text is
    returned as ``float``. Returns None when the element is missing or its
    text holds no such token.
    """
    from lxml.cssselect import CSSSelector
    import re

    find_rating = CSSSelector("span#acrPopover span.a-size-base.a-color-base")
    rating_nodes = find_rating(lxml_tree)
    if not rating_nodes:
        return None

    rating_text = rating_nodes[0].text_content()
    found = re.search(r'\d+\.\d+', rating_text)
    if found:
        return float(found.group())
    return None
if __name__ == '__main__':
    import lxml.html

    # Fetch one hard-coded product page, run every extractor on it and
    # print the collected fields as a dict.
    url = 'https://www.amazon.com/dp/B0D3DWTKKM/ref=sspa_dk_detail_4'
    html = get_html(url)
    tree = lxml.html.fromstring(html)
    result = {
        'SellPrice': extract_SellPrice(tree),
        'SellPrice2': extract_SellPrice2(tree),
        'ProductName': extract_ProductName(tree),
        'TotalReview': extract_TotalReview(tree),
        'ProductImage': extract_ProductImage(tree),
        'AverageReview': extract_AverageReview(tree),
    }
    print(result)