This commit is contained in:
2026-02-01 14:51:55 +02:00
parent 868d4dab21
commit 55c5fd7807
3 changed files with 45 additions and 17 deletions

View File

@@ -9,8 +9,10 @@ import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from urllib.request import urlopen, Request
from urllib.request import urlopen
from bs4 import BeautifulSoup
def get_car_features(element, mylist):
    """Append the fields of one search-result row to the column lists.

    Args:
        element: The parsed result row. It must contain a
            ``div.description`` block and an ``a.row-link`` anchor.
        mylist: Six parallel lists that are extended in place, indexed as
            0 = ad id, 1 = description, 2 = year, 3 = fuel,
            4 = transmission, 5 = price.

    Returns:
        None. ``mylist`` is mutated in place.

    Raises:
        AttributeError: If a required node (description block, row link,
            title, year, fuel or transmission) is missing from the row.
    """
    car_instance = element.findChild('div', attrs={"class": "description"})

    # Find ad's id: third '/'-separated segment of the row link's href.
    ad_link = element.findChild('a', attrs={"class": "row-link"})
    id_value = str(ad_link.attrs['href']).split('/')[2].strip("}").strip("'")

    # Find car description
    description = car_instance.findChild('a', attrs={"class": "main"}).get_text()
    # Production year
    year = car_instance.findChild('span', attrs={'class': 'year'}).get_text()
    # Fuel
    fuel = car_instance.findChild('span', attrs={'class': 'fuel'}).get_text()
    # Transmission
    transmission = car_instance.findChild(
        'span', attrs={'class': 'transmission'}
    ).get_text()
    # Price: the only optional field; rows without one get an empty string.
    price = car_instance.findChild('span', attrs={'class': 'price'})
    price_text = price.get_text() if price is not None else ''

    # Append only after every lookup succeeded, so a malformed row can never
    # leave the parallel columns with different lengths.
    values = (id_value, description, year, fuel, transmission, price_text)
    for index, value in enumerate(values):
        mylist[index].append(value)
# In[374]:
@@ -61,8 +66,8 @@ def scrape_page(url):
[], # transmission
[], # Price
]
mytable = soup.find('table',attrs={'id':'usedVehiclesSearchResult'})
result_rows = mytable.findChildren('tr',attrs={'class':re.compile(fr"result-row item-.")})
mytable = soup.find('div',attrs={'id':'usedVehiclesSearchResult-flex'})
result_rows = mytable.findChildren('div',attrs={'class':re.compile(fr"result-row item-.")})
for row in result_rows:
get_car_features(row,mylist)
df = pd.DataFrame(mylist)
@@ -90,18 +95,30 @@ import time
timeout = time.time() + 60*1
dataframes=[]
#url = "https://www.auto24.ee/kasutatud/nimekiri.php?bn=2&a=100&aj=&ssid=46274853&b=4&bw=21&ae=8&af=200&otsi=otsi"
url = "https://www.auto24.ee/kasutatud/nimekiri.php?bn=2&a=101&aj=&f1=2018&g2=12000&k1=60&ae=2&af=50&ag=0&ag=1&otsi=otsi"
# driver = webdriver.Firefox()
# driver.get(url)
# Request headers sent with every page fetch; they are attached to the listing
# request below via Request(url, headers=hdr).
# NOTE(review): presumably these imitate a desktop Chrome browser so the site
# serves the normal listing page to urllib - confirm this is still required.
# NOTE(review): 'Accept-Encoding': 'none' looks intended to avoid compressed
# responses, which urlopen does not decompress on its own - confirm.
# NOTE(review): the live 'Cookie' entry hard-codes a PHPSESSID and a __cf_bm
# token copied from a browser session (timestamps in the value are from
# Feb 2022). Session credentials should not be committed; they are very likely
# expired. Consider loading the cookie from the environment or dropping it.
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.9,et;q=0.8',
'Connection': 'keep-alive',
# 'Cookie': 'CID=1621884104633711; my_searches_notif=1; PHPSESSID=sc66u21lot8te95edda73985tu; OptanonAlertBoxClosed=2022-01-26T10:18:23.206Z; __utma=13167336.877242740.1643192318.1643192318.1643192318.1; __utmc=13167336; __utmz=13167336.1643192318.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __cf_bm=QLTYnfz3iFPTGXscwrRdyJrkacwt8enoFU4EqQ4w33E-1644415671-0-AVspsg+4OGWPbPvqTxKtvECVzzHBpAluh0vUtjvbrznme9mZ7QRZP5mO28qeaeZ3rvWRlU8ES4oxAzZ+Z8Wy1S4=; OptanonConsent=isIABGlobal=false&datestamp=Wed+Feb+09+2022+16%3A11%3A38+GMT%2B0200+(Eastern+European+Standard+Time)&version=6.16.0&hosts=&landingPath=NotLandingPage&groups=C0004%3A1%2CC0003%3A1%2CC0002%3A1%2CC0001%3A1&AwaitingReconsent=false&geolocation=%3B',
# 'Cookie': 'CID=1621884104633711; my_searches_notif=1; PHPSESSID=sc66u21lot8te95edda73985tu; OptanonAlertBoxClosed=2022-01-26T10:18:23.206Z; __utma=13167336.877242740.1643192318.1643192318.1643192318.1; __utmc=13167336; __utmz=13167336.1643192318.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); OptanonConsent=isIABGlobal=false&datestamp=Wed+Feb+09+2022+16%3A19%3A48+GMT%2B0200+(Eastern+European+Standard+Time)&version=6.16.0&hosts=&landingPath=NotLandingPage&groups=C0004%3A1%2CC0003%3A1%2CC0002%3A1%2CC0001%3A1&AwaitingReconsent=false&geolocation=%3B; __cf_bm=cp_7820d1SeeDxv37StQH7HMg0KZgMVEHbcD_JFYmQ0-1644416588-0-AWYK6qWmiAJwRtqvbtwuBgDJU2FauySHu6Mn9R6vYiUIrvWkcgs1khFFJcpvWmFP9o9m1LkwHNRsRSvgFPTo/bY=',
'Cookie': 'CID=1621884104633711; my_searches_notif=1; PHPSESSID=sc66u21lot8te95edda73985tu; OptanonAlertBoxClosed=2022-01-26T10:18:23.206Z; __utma=13167336.877242740.1643192318.1643192318.1643192318.1; __utmc=13167336; __utmz=13167336.1643192318.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __cf_bm=cp_7820d1SeeDxv37StQH7HMg0KZgMVEHbcD_JFYmQ0-1644416588-0-AWYK6qWmiAJwRtqvbtwuBgDJU2FauySHu6Mn9R6vYiUIrvWkcgs1khFFJcpvWmFP9o9m1LkwHNRsRSvgFPTo/bY=; OptanonConsent=isIABGlobal=false&datestamp=Wed+Feb+09+2022+16%3A27%3A50+GMT%2B0200+(Eastern+European+Standard+Time)&version=6.16.0&hosts=&landingPath=NotLandingPage&groups=C0004%3A1%2CC0003%3A1%2CC0002%3A1%2CC0001%3A1&AwaitingReconsent=false&geolocation=%3B',
'Referer': 'https://www.auto24.ee/kasutatud/nimekiri.php?bn=2&a=100&b=4&bw=21&ae=8&af=50&ssid=46274853&ak=50'}
# Listing-page request carrying the headers above.
autoreq = Request(url, headers=hdr)
#nextlink = soup.find('a',attrs={'class':"input-link item","rel":"next"})
#url = "https://www.auto24.ee/" + nextlink.attrs.get('href')
has_nextpage=True
while(has_nextpage):
dataframes.append(scrape_page(url))
dataframes.append(scrape_page(autoreq))
html = urlopen(url)
html = urlopen(autoreq)
soup = BeautifulSoup(html, 'lxml')
nextlink = soup.find('a',attrs={'class':"input-link item","rel":"next"})
@@ -138,7 +155,7 @@ type(df_all.iloc[0,0])
def scrape_detailview(id_string, list_in):
url = 'https://www.auto24.ee/used/' + id_string
html = urlopen(url)
html = urlopen(Request(url, headers=hdr))
soup = BeautifulSoup(html, 'lxml')
list_in[0].append(str(id_string))
@@ -246,9 +263,9 @@ df['km'] = pd.to_numeric(df['km'])
# Leading digit group of a price string: 2-3 digits, an optional space, then
# any further digits. Raw string so that "\d" reaches the regex engine
# unchanged instead of being an invalid string escape.
mypattern = re.compile(r'(^\d{2,3} ?\d*)')
# NFKD normalisation replaces compatibility characters (e.g. a non-breaking
# space) with their plain equivalents - presumably so thousands separators in
# the scraped prices become ordinary spaces; verify against the raw data.
df['price'] = df['price'].apply(lambda x: unicodedata.normalize("NFKD", str(x)))
df['price'] = df['price'].apply(lambda x: mypattern.findall(x)[0].replace(" ",""))
df['price']=pd.to_numeric(df['price'])
df['price'].unique()
# df['price'] = df['price'].apply(lambda x: mypattern.findall(x)[0].replace(" ",""))
#df['price']=pd.to_numeric(df['price'])
#df['price'].unique()
# In[ ]:
@@ -289,3 +306,8 @@ df[(df['year']==2009)
df[['manufacturer','model']].value_counts()
# Export the cleaned frame as JSON.
# DataFrame.to_json() already returns serialised JSON text, so it is written
# as-is; passing it through json.dump() would encode it a second time and
# leave the file holding a single quoted, escaped string instead of an object.
filename = 'file.json'
with open(filename, 'w', encoding='utf-8') as file:
    file.write(df.to_json())

1
file.json Normal file

File diff suppressed because one or more lines are too long

5
requirements.txt Normal file
View File

@@ -0,0 +1,5 @@
beautifulsoup4==4.10.0
matplotlib==3.3.4
numpy==1.19.5
pandas==1.3.0
seaborn==0.11.2