diff --git a/auto24.py b/auto24.py index fcfbd49..a6452f5 100644 --- a/auto24.py +++ b/auto24.py @@ -9,8 +9,10 @@ import numpy as np import matplotlib.pyplot as plt import seaborn as sns import re +import json + +from urllib.request import urlopen, Request -from urllib.request import urlopen from bs4 import BeautifulSoup @@ -23,27 +25,30 @@ def get_car_features(element, mylist): """Returns the car's features as nested list""" - car_instance = element.findChild('td',attrs={"class":"make_and_model"}) + car_instance = element.findChild('div',attrs={"class":"description"}) #Find ad's id - ad_link = car_instance.findChild('a') - mylist[0].append(str(ad_link.attrs).split('/')[2].strip("}").strip("'")) + ad_link = element.findChild('a',attrs={"class":"row-link"}) + id_value = str(ad_link.attrs['href']).split('/')[2].strip("}").strip("'") + mylist[0].append(id_value) #Find car description - mylist[1].append(ad_link.get_text()) + mylist[1].append(car_instance.findChild('a',attrs={"class":"main"}).get_text()) #Production year - year_td = car_instance.find_next_sibling('td',attrs={'class':'year'}) + year_td = car_instance.findChild('span',attrs={'class':'year'}) mylist[2].append(year_td.get_text()) #Fuel - mylist[3].append(car_instance.find_next_sibling('td',attrs={'class':'fuel'}).get_text()) + mylist[3].append(car_instance.findChild('span',attrs={'class':'fuel'}).get_text()) #Transmission - mylist[4].append(car_instance.find_next_sibling('td',attrs={'class':'transmission'}).get_text()) + mylist[4].append(car_instance.findChild('span',attrs={'class':'transmission'}).get_text()) #Price - mylist[5].append(car_instance.find_next_sibling('td',attrs={'class':'price'}).get_text()) + price = car_instance.findChild('span',attrs={'class':'price'}) + print(price) + mylist[5].append(price.get_text() if price else '') # In[374]: @@ -61,8 +66,8 @@ def scrape_page(url): [], # transmission [], # Price ] - mytable = soup.find('table',attrs={'id':'usedVehiclesSearchResult'}) - result_rows = mytable.findChildren('tr',attrs={'class':re.compile(fr"result-row item-.")}) + mytable = soup.find('div',attrs={'id':'usedVehiclesSearchResult-flex'}) + result_rows = mytable.findChildren('div',attrs={'class':re.compile(fr"result-row item-.")}) for row in result_rows: get_car_features(row,mylist) df = pd.DataFrame(mylist) @@ -90,18 +95,30 @@ import time timeout = time.time() + 60*1 dataframes=[] +#url = "https://www.auto24.ee/kasutatud/nimekiri.php?bn=2&a=100&aj=&ssid=46274853&b=4&bw=21&ae=8&af=200&otsi=otsi" url = "https://www.auto24.ee/kasutatud/nimekiri.php?bn=2&a=101&aj=&f1=2018&g2=12000&k1=60&ae=2&af=50&ag=0&ag=1&otsi=otsi" # driver = webdriver.Firefox() # driver.get(url) +hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', + 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', + 'Accept-Encoding': 'none', + 'Accept-Language': 'en-US,en;q=0.9,et;q=0.8', + 'Connection': 'keep-alive', + # 'Cookie': 'CID=1621884104633711; my_searches_notif=1; PHPSESSID=sc66u21lot8te95edda73985tu; OptanonAlertBoxClosed=2022-01-26T10:18:23.206Z; __utma=13167336.877242740.1643192318.1643192318.1643192318.1; __utmc=13167336; __utmz=13167336.1643192318.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __cf_bm=QLTYnfz3iFPTGXscwrRdyJrkacwt8enoFU4EqQ4w33E-1644415671-0-AVspsg+4OGWPbPvqTxKtvECVzzHBpAluh0vUtjvbrznme9mZ7QRZP5mO28qeaeZ3rvWRlU8ES4oxAzZ+Z8Wy1S4=; OptanonConsent=isIABGlobal=false&datestamp=Wed+Feb+09+2022+16%3A11%3A38+GMT%2B0200+(Eastern+European+Standard+Time)&version=6.16.0&hosts=&landingPath=NotLandingPage&groups=C0004%3A1%2CC0003%3A1%2CC0002%3A1%2CC0001%3A1&AwaitingReconsent=false&geolocation=%3B', + # 'Cookie': 'CID=1621884104633711; my_searches_notif=1; PHPSESSID=sc66u21lot8te95edda73985tu; OptanonAlertBoxClosed=2022-01-26T10:18:23.206Z; __utma=13167336.877242740.1643192318.1643192318.1643192318.1; __utmc=13167336; __utmz=13167336.1643192318.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); OptanonConsent=isIABGlobal=false&datestamp=Wed+Feb+09+2022+16%3A19%3A48+GMT%2B0200+(Eastern+European+Standard+Time)&version=6.16.0&hosts=&landingPath=NotLandingPage&groups=C0004%3A1%2CC0003%3A1%2CC0002%3A1%2CC0001%3A1&AwaitingReconsent=false&geolocation=%3B; __cf_bm=cp_7820d1SeeDxv37StQH7HMg0KZgMVEHbcD_JFYmQ0-1644416588-0-AWYK6qWmiAJwRtqvbtwuBgDJU2FauySHu6Mn9R6vYiUIrvWkcgs1khFFJcpvWmFP9o9m1LkwHNRsRSvgFPTo/bY=', + 'Cookie': 'CID=1621884104633711; my_searches_notif=1; PHPSESSID=sc66u21lot8te95edda73985tu; OptanonAlertBoxClosed=2022-01-26T10:18:23.206Z; __utma=13167336.877242740.1643192318.1643192318.1643192318.1; __utmc=13167336; __utmz=13167336.1643192318.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __cf_bm=cp_7820d1SeeDxv37StQH7HMg0KZgMVEHbcD_JFYmQ0-1644416588-0-AWYK6qWmiAJwRtqvbtwuBgDJU2FauySHu6Mn9R6vYiUIrvWkcgs1khFFJcpvWmFP9o9m1LkwHNRsRSvgFPTo/bY=; OptanonConsent=isIABGlobal=false&datestamp=Wed+Feb+09+2022+16%3A27%3A50+GMT%2B0200+(Eastern+European+Standard+Time)&version=6.16.0&hosts=&landingPath=NotLandingPage&groups=C0004%3A1%2CC0003%3A1%2CC0002%3A1%2CC0001%3A1&AwaitingReconsent=false&geolocation=%3B', + 'Referer': 'https://www.auto24.ee/kasutatud/nimekiri.php?bn=2&a=100&b=4&bw=21&ae=8&af=50&ssid=46274853&ak=50'} +autoreq = Request(url, headers=hdr) #nextlink = soup.find('a',attrs={'class':"input-link item","rel":"next"}) #url = "https://www.auto24.ee/" + nextlink.attrs.get('href') has_nextpage=True while(has_nextpage): - dataframes.append(scrape_page(url)) + dataframes.append(scrape_page(autoreq)) - html = urlopen(url) + html = urlopen(autoreq) soup = BeautifulSoup(html, 'lxml') nextlink = soup.find('a',attrs={'class':"input-link item","rel":"next"}) @@ -138,7 +155,7 @@ type(df_all.iloc[0,0]) def scrape_detailview(id_string, list_in): url = 'https://www.auto24.ee/used/' + id_string - html = urlopen(url) + html = urlopen(Request(url, headers=hdr)) soup = BeautifulSoup(html, 'lxml') list_in[0].append(str(id_string)) @@ -246,9 +263,9 @@ df['km'] = pd.to_numeric(df['km']) mypattern = re.compile('(^\d{2,3} ?\d*)') df['price']= df['price'].apply(lambda x: unicodedata.normalize("NFKD",str(x))) -df['price'] = df['price'].apply(lambda x: mypattern.findall(x)[0].replace(" ","")) -df['price']=pd.to_numeric(df['price']) -df['price'].unique() +# df['price'] = df['price'].apply(lambda x: mypattern.findall(x)[0].replace(" ","")) +#df['price']=pd.to_numeric(df['price']) +#df['price'].unique() # In[ ]: @@ -289,3 +306,8 @@ df[(df['year']==2009) df[['manufacturer','model']].value_counts() + +filename = 'file.json' + +with open(filename, 'w') as file: + json.dump(df.to_json(), file) diff --git a/file.json b/file.json new file mode 100644 index 0000000..5975af4 --- /dev/null +++ b/file.json @@ -0,0 +1 @@ +"{\"id\":{\"13\":3638295,\"8\":3641165,\"18\":3649448,\"5\":3522393,\"22\":3592065,\"10\":3606534,\"34\":3641596,\"35\":3638291,\"6\":3615086,\"11\":3649358,\"12\":3572898,\"23\":3538158,\"2\":3602490,\"24\":3644248,\"29\":3613579,\"17\":3645014,\"30\":3631631,\"25\":3562253,\"14\":3606192,\"20\":3587898,\"26\":3636184,\"4\":3615654,\"31\":3645052,\"7\":3609319,\"1\":3608266,\"15\":3649098,\"32\":3646408,\"28\":3609731,\"33\":3563152,\"9\":3590171,\"27\":3641901,\"16\":3592612,\"36\":3648641,\"19\":3648048,\"37\":3622970,\"21\":3647949,\"0\":3644365,\"3\":3596594},\"body\":{\"13\":\"luukp\\u00e4ra\",\"8\":\"luukp\\u00e4ra\",\"18\":\"luukp\\u00e4ra\",\"5\":\"universaal\",\"22\":\"luukp\\u00e4ra\",\"10\":\"mahtuniversaal\",\"34\":\"universaal\",\"35\":\"universaal\",\"6\":\"universaal\",\"11\":\"luukp\\u00e4ra\",\"12\":\"luukp\\u00e4ra\",\"23\":\"luukp\\u00e4ra\",\"2\":\"luukp\\u00e4ra\",\"24\":\"universaal\",\"29\":\"luukp\\u00e4ra\",\"17\":\"universaal\",\"30\":\"universaal\",\"25\":\"universaal\",\"14\":\"luukp\\u00e4ra\",\"20\":\"universaal\",\"26\":\"universaal\",\"4\":\"universaal\",\"31\":\"luukp\\u00e4ra\",\"7\":\"luukp\\u00e4ra\",\"1\":\"universaal\",\"15\":\"universaal\",\"32\":\"universaal\",\"28\":\"universaal\",\"33\":\"sedaan\",\"9\":\"luukp\\u00e4ra\",\"27\":\"sedaan\",\"16\":\"universaal\",\"36\":\"sedaan\",\"19\":\"luukp\\u00e4ra\",\"37\":\"mahtuniversaal\",\"21\":\"luukp\\u00e4ra\",\"0\":\"mahtuniversaal\",\"3\":\"mahtuniversaal\"},\"man_date\":{\"13\":\"01\\/2019\",\"8\":\"06\\/2019\",\"18\":\"10\\/2019\",\"5\":\"06\\/2018\",\"22\":\"08\\/2018\",\"10\":\"11\\/2018\",\"34\":\"04\\/2018\",\"35\":\"06\\/2018\",\"6\":\"02\\/2018\",\"11\":\"01\\/2018\",\"12\":\"10\\/2018\",\"23\":\"07\\/2018\",\"2\":\"06\\/2018\",\"24\":\"07\\/2018\",\"29\":\"07\\/2019\",\"17\":\"01\\/2018\",\"30\":\"08\\/2018\",\"25\":\"03\\/2019\",\"14\":\"06\\/2018\",\"20\":\"08\\/2018\",\"26\":\"01\\/2018\",\"4\":\"04\\/2018\",\"31\":\"02\\/2018\",\"7\":\"08\\/2018\",\"1\":\"02\\/2018\",\"15\":\"01\\/2019\",\"32\":\"01\\/2018\",\"28\":\"04\\/2018\",\"33\":\"03\\/2019\",\"9\":\"05\\/2018\",\"27\":\"05\\/2018\",\"16\":\"04\\/2018\",\"36\":\"05\\/2019\",\"19\":\"05\\/2019\",\"37\":\"11\\/2021\",\"21\":\"05\\/2018\",\"0\":\"2018\",\"3\":\"02\\/2018\"},\"km\":{\"13\":60256,\"8\":12877,\"18\":36157,\"5\":133478,\"22\":56380,\"10\":25100,\"34\":87000,\"35\":29600,\"6\":48000,\"11\":53000,\"12\":41107,\"23\":29561,\"2\":33900,\"24\":66000,\"29\":19900,\"17\":144531,\"30\":31735,\"25\":99417,\"14\":45028,\"20\":46115,\"26\":58790,\"4\":151166,\"31\":180582,\"7\":33000,\"1\":142615,\"15\":179702,\"32\":107000,\"28\":198000,\"33\":120000,\"9\":182110,\"27\":122458,\"16\":237000,\"36\":44000,\"19\":29765,\"37\":10,\"21\":48150,\"0\":320000,\"3\":303000},\"description\":{\"13\":\"Citroen C3 BlueHDi 100 FeelC3 1.5 75kW\",\"8\":\"Citroen C3 PureTech 82 FeelC3 1.2 61kW\",\"18\":\"Citroen C3C3 1.5 75kW\",\"5\":\"Citroen C4 CactusC4 Cactus 1.2 60kW\",\"22\":\"Citroen C4 CactusC4 Cactus 1.2 HM01 81kW\",\"10\":\"Dacia DokkerDokker 1.6 75kW\",\"34\":\"Dacia DusterDuster 1.5 80kW\",\"35\":\"Dacia DusterDuster 1.6 84kW\",\"6\":\"Dacia Logan MCV Laureate Easy-RLogan MCV 0.9 Tce 66kW\",\"11\":\"Dacia SanderoSandero 0.9 66kW\",\"12\":\"Ford FiestaFiesta 1.5 63kW\",\"23\":\"Hyundai KonaKona 1.0 88kW\",\"2\":\"Hyundai i20i20 1.2 62kW\",\"24\":\"Kia cee'd Sportswagon LXcee'd Sportswagon 1.6 99kW\",\"29\":\"Nissan Micra AcentaMicra 0.9 66kW\",\"17\":\"Opel Astra BusinessAstra 1.6 81kW\",\"30\":\"Opel Astra Sports Tourer BusinessAstra 1.0 EcoTec 77kW\",\"25\":\"Opel Astra Sports Tourer BusinessAstra 1.6 CDTI 81kW\",\"14\":\"Opel AstraAstra 1.4 74kW\",\"20\":\"Peugeot 2008 Active2008 1.2 61kW\",\"26\":\"Peugeot 20082008 1.2 60kW\",\"4\":\"Peugeot 308308 1.2 81kW\",\"31\":\"SEAT LeonLeon 1.6 85kW\",\"7\":\"Skoda Fabia AmbitionFabia 1.0 70kW\",\"1\":\"Skoda Fabia Combi Ambition DriveFabia 1.0 81kW\",\"15\":\"Skoda Fabia Elegance Combi FLFabia 1.0 TSI 81kW\",\"32\":\"Skoda Fabia Monte CarloFabia 1.0 81kW\",\"28\":\"Skoda Octavia FaceliftOctavia 1.6 85kW\",\"33\":\"Skoda OctaviaOctavia 1.0 85kW\",\"9\":\"Skoda OctaviaOctavia 1.4 81kW\",\"27\":\"Skoda OctaviaOctavia 1.5 TSI 110kW\",\"16\":\"Skoda OctaviaOctavia 1.6 tdi 85kW\",\"36\":\"Skoda Rapid FL ActiveRapid 1.0 70kW\",\"19\":\"Skoda Rapid Sedan FLRapid 1.0 TSI 70kW\",\"37\":\"Skywell ET5ET5 100kW\",\"21\":\"Toyota Yaris ActiveYaris 1.5 82kW\",\"0\":\"Volkswagen Caddy MAXI INVACaddy 2.0 110kW\",\"3\":\"Volkswagen Touran Comfortline TDITouran 2.0 81kW\"},\"year\":{\"13\":2019,\"8\":2019,\"18\":2019,\"5\":2018,\"22\":2018,\"10\":2018,\"34\":2018,\"35\":2018,\"6\":2018,\"11\":2018,\"12\":2018,\"23\":2018,\"2\":2018,\"24\":2018,\"29\":2019,\"17\":2018,\"30\":2018,\"25\":2019,\"14\":2018,\"20\":2018,\"26\":2018,\"4\":2018,\"31\":2018,\"7\":2018,\"1\":2018,\"15\":2019,\"32\":2018,\"28\":2018,\"33\":2019,\"9\":2018,\"27\":2018,\"16\":2018,\"36\":2019,\"19\":2019,\"37\":2021,\"21\":2018,\"0\":2018,\"3\":2018},\"fuel\":{\"13\":\"Diisel\",\"8\":\"Bensiin\",\"18\":\"Diisel\",\"5\":\"Bensiin\",\"22\":\"Bensiin\",\"10\":\"Bensiin\",\"34\":\"Diisel\",\"35\":\"Bensiin\",\"6\":\"Bensiin\",\"11\":\"Bensiin\",\"12\":\"Diisel\",\"23\":\"Bensiin\",\"2\":\"Bensiin\",\"24\":\"Bensiin\",\"29\":\"Bensiin\",\"17\":\"Diisel\",\"30\":\"Bensiin\",\"25\":\"Diisel\",\"14\":\"Bensiin\",\"20\":\"Bensiin\",\"26\":\"Bensiin\",\"4\":\"Bensiin\",\"31\":\"Diisel\",\"7\":\"Bensiin\",\"1\":\"Bensiin\",\"15\":\"Bensiin\",\"32\":\"Bensiin\",\"28\":\"Diisel\",\"33\":\"Bensiin\",\"9\":\"Bensiin + gaas (CNG\\/surugaas)\",\"27\":\"Bensiin\",\"16\":\"Diisel\",\"36\":\"Bensiin\",\"19\":\"Bensiin\",\"37\":\"Elekter\",\"21\":\"Bensiin\",\"0\":\"Diisel\",\"3\":\"Diisel\"},\"gearbox\":{\"13\":\"Manuaal\",\"8\":\"Manuaal\",\"18\":\"Manuaal\",\"5\":\"Manuaal\",\"22\":\"Manuaal\",\"10\":\"Manuaal\",\"34\":\"Manuaal\",\"35\":\"Manuaal\",\"6\":\"Automaat\",\"11\":\"Manuaal\",\"12\":\"Manuaal\",\"23\":\"Manuaal\",\"2\":\"Manuaal\",\"24\":\"Manuaal\",\"29\":\"Manuaal\",\"17\":\"Manuaal\",\"30\":\"Manuaal\",\"25\":\"Manuaal\",\"14\":\"Manuaal\",\"20\":\"Manuaal\",\"26\":\"Manuaal\",\"4\":\"Manuaal\",\"31\":\"Manuaal\",\"7\":\"Manuaal\",\"1\":\"Manuaal\",\"15\":\"Automaat\",\"32\":\"Manuaal\",\"28\":\"Automaat\",\"33\":\"Manuaal\",\"9\":\"Automaat\",\"27\":\"Manuaal\",\"16\":\"Manuaal\",\"36\":\"Manuaal\",\"19\":\"Manuaal\",\"37\":\"Automaat\",\"21\":\"Automaat\",\"0\":\"Automaat\",\"3\":\"Automaat\"},\"price\":{\"13\":\"10 900 \\u20ac\",\"8\":\"9900 \\u20ac\",\"18\":\"10 995 \\u20ac\",\"5\":\"9490 \\u20ac\",\"22\":\"11 500 \\u20ac\",\"10\":\"10 400 \\u20ac\",\"34\":\"11 999 \\u20ac\",\"35\":\"11 999 \\u20ac\",\"6\":\"9490 \\u20ac\",\"11\":\"10 499 \\u20ac\",\"12\":\"10 590 \\u20ac\",\"23\":\"11 500 \\u20ac\",\"2\":\"8990 \\u20ac\",\"24\":\"11 900 \\u20ac\",\"29\":\"11 990 \\u20ac\",\"17\":\"10 990 \\u20ac\",\"30\":\"11 990 \\u20ac\",\"25\":\"11 900 \\u20ac\",\"14\":\"10 900 \\u20ac\",\"20\":\"11 390 \\u20ac\",\"26\":\"11 900 \\u20ac\",\"4\":\"9300 \\u20ac\",\"31\":\"11 990 \\u20ac\",\"7\":\"9600 \\u20ac\",\"1\":\"8700 \\u20ac\",\"15\":\"10 900 \\u20ac\",\"32\":\"11 990 \\u20ac\",\"28\":\"11 900 \\u20ac\",\"33\":\"11 990 \\u20ac\",\"9\":\"10 100 \\u20ac\",\"27\":\"11 900 \\u20ac\",\"16\":\"10 950 \\u20ac\",\"36\":\"12 000 \\u20ac\",\"19\":\"11 200 \\u20ac\",\"37\":\"\",\"21\":\"11 495 \\u20ac\",\"0\":\"7500 \\u20ac\",\"3\":\"8999 \\u20ac\"},\"model\":{\"13\":\"C3\",\"8\":\"C3\",\"18\":\"C3C3\",\"5\":\"C4\",\"22\":\"C4\",\"10\":\"DokkerDokker\",\"34\":\"DusterDuster\",\"35\":\"DusterDuster\",\"6\":\"Logan\",\"11\":\"SanderoSandero\",\"12\":\"FiestaFiesta\",\"23\":\"KonaKona\",\"2\":\"i20i20\",\"24\":\"cee'd\",\"29\":\"Micra\",\"17\":\"Astra\",\"30\":\"Astra\",\"25\":\"Astra\",\"14\":\"AstraAstra\",\"20\":\"2008\",\"26\":\"20082008\",\"4\":\"308308\",\"31\":\"LeonLeon\",\"7\":\"Fabia\",\"1\":\"Fabia\",\"15\":\"Fabia\",\"32\":\"Fabia\",\"28\":\"Octavia\",\"33\":\"OctaviaOctavia\",\"9\":\"OctaviaOctavia\",\"27\":\"OctaviaOctavia\",\"16\":\"OctaviaOctavia\",\"36\":\"Rapid\",\"19\":\"Rapid\",\"37\":\"ET5ET5\",\"21\":\"Yaris\",\"0\":\"Caddy\",\"3\":\"Touran\"},\"manufacturer\":{\"13\":\"Citroen\",\"8\":\"Citroen\",\"18\":\"Citroen\",\"5\":\"Citroen\",\"22\":\"Citroen\",\"10\":\"Dacia\",\"34\":\"Dacia\",\"35\":\"Dacia\",\"6\":\"Dacia\",\"11\":\"Dacia\",\"12\":\"Ford\",\"23\":\"Hyundai\",\"2\":\"Hyundai\",\"24\":\"Kia\",\"29\":\"Nissan\",\"17\":\"Opel\",\"30\":\"Opel\",\"25\":\"Opel\",\"14\":\"Opel\",\"20\":\"Peugeot\",\"26\":\"Peugeot\",\"4\":\"Peugeot\",\"31\":\"SEAT\",\"7\":\"Skoda\",\"1\":\"Skoda\",\"15\":\"Skoda\",\"32\":\"Skoda\",\"28\":\"Skoda\",\"33\":\"Skoda\",\"9\":\"Skoda\",\"27\":\"Skoda\",\"16\":\"Skoda\",\"36\":\"Skoda\",\"19\":\"Skoda\",\"37\":\"Skywell\",\"21\":\"Toyota\",\"0\":\"Volkswagen\",\"3\":\"Volkswagen\"}}" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..67126d2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +beautifulsoup4==4.10.0 +matplotlib==3.3.4 +numpy==1.19.5 +pandas==1.3.0 +seaborn==0.11.2