#Code Generated by Parser:d912af86 Rule: 2025-01-08 13:58:44
def get_html(url):
    """Load *url* in a Chromium browser and return the page's HTML.

    The page is opened with Playwright in a visible (non-headless) browser
    window. After a fixed 10-second pause the current DOM is serialized.
    A copy of the HTML is also written to ``debug.html`` in the current
    working directory, overwriting any existing file.

    Args:
        url: Address of the page to load.

    Returns:
        The serialized page content as a string.
    """
    import time

    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        try:
            page = browser.new_page()
            page.goto(url)
            # Fixed pause before the snapshot -- presumably to let
            # script-rendered content appear; TODO confirm it is still needed.
            time.sleep(10)
            page_source = page.content()
        finally:
            # Close the browser even when navigation or serialization fails.
            browser.close()

    # Keep a copy on disk so selector problems can be inspected offline.
    with open("debug.html", "w", encoding="utf-8") as file:
        file.write(page_source)
    return page_source
from lxml.cssselect import CSSSelector
from decimal import Decimal
import re
def extract_SellPrice(lxml_tree):
    """Return the selling price found in *lxml_tree* as a ``Decimal``.

    The text of the first element matching
    ``.button-buy span.icon-shopping-bag-white`` is stripped, any run of
    consecutive dots is collapsed to a single dot, and the result is parsed
    as a decimal number.

    Args:
        lxml_tree: Parsed lxml document (or element) to search.

    Returns:
        The price as a ``Decimal``, or ``None`` when no element matches or
        the matched element has no text. ``None`` mirrors what the other
        extractors in this file return for a missing element.

    Raises:
        decimal.InvalidOperation: If the cleaned text is not a valid decimal
            number.
    """
    import re
    from decimal import Decimal

    from lxml.cssselect import CSSSelector

    matches = CSSSelector('.button-buy span.icon-shopping-bag-white')(lxml_tree)
    if not matches:
        return None

    price_text = matches[0].text_content().strip()
    if not price_text:
        # Decimal('') would raise; treat empty text like a missing element.
        return None

    # Collapse accidental runs of dots (e.g. "12..50") into one separator.
    clean_price = re.sub(r'\.\.+', '.', price_text)
    return Decimal(clean_price)
def extract_ProductName(lxml_tree):
    """Return the product name from *lxml_tree*, or ``None`` if absent.

    Takes the text content of the first element matching
    ``hm-product-name h1``, replaces every run of whitespace with a single
    space, and trims both ends.
    """
    from lxml.cssselect import CSSSelector
    import re

    find_heading = CSSSelector("hm-product-name h1")
    headings = find_heading(lxml_tree)
    if not headings:
        return None

    raw_name = headings[0].text_content()
    collapsed = re.sub(r'\s+', ' ', raw_name)
    return collapsed.strip()
def extract_Availability(lxml_tree):
    """Report whether *lxml_tree* contains a delivery-information element.

    Returns ``True`` when at least one element matches
    ``#delivery-information-wrapper`` and ``False`` otherwise.
    """
    from lxml.cssselect import CSSSelector

    find_delivery_info = CSSSelector("#delivery-information-wrapper")
    if find_delivery_info(lxml_tree):
        return True
    return False
def extract_ProductImage(lxml_tree):
    """Return the product image URL from *lxml_tree*, or ``None`` if absent.

    Reads the ``src`` attribute of the first element matching
    ``figure.pdp-secondary-image img``, replaces every run of whitespace
    with a single space, and trims both ends.

    Args:
        lxml_tree: Parsed lxml document (or element) to search.

    Returns:
        The cleaned ``src`` value, or ``None`` when no element matches or
        the matched element has no ``src`` attribute.
    """
    from lxml.cssselect import CSSSelector
    import re

    selector = CSSSelector("figure.pdp-secondary-image img")
    result = selector(lxml_tree)
    if not result:
        return None

    src = result[0].get("src")
    if src is None:
        # An <img> without a src attribute would make re.sub raise
        # TypeError; report it as "no image" instead.
        return None
    return re.sub(r'\s+', ' ', src).strip()
if __name__ == '__main__':
    import lxml.html

    # Fetch the hard-coded product page, parse it, then run every extractor
    # in turn and print the collected fields.
    url = 'https://www2.hm.com/en_hk/productpage.1267555004.html'
    html = get_html(url)
    tree = lxml.html.fromstring(html)
    result = {
        'SellPrice': extract_SellPrice(tree),
        'ProductName': extract_ProductName(tree),
        'Availability': extract_Availability(tree),
        'ProductImage': extract_ProductImage(tree),
    }
    print(result)