Python
Python——爬取中国教育在线大学列表
import pandas as pds
import numpy
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
browser=webdriver.Chrome() # Start Chrome through Selenium; this module-level driver is the one the functions in this file operate on.
def go_url(url):
    """Open *url* in the shared browser and wait for the results table.

    Waits up to 10 seconds for the element matching
    ``//*[@id="seachtab"]/tbody`` to be present in the DOM. If the wait
    times out, ``'Timeout'`` is printed and the function returns normally.
    """
    table_body_locator = (By.XPATH, '//*[@id="seachtab"]/tbody')
    waiter = WebDriverWait(browser, 10)
    try:
        browser.get(url)
        table_present = EC.presence_of_element_located(table_body_locator)
        waiter.until(table_present)
    except TimeoutException:
        print('Timeout')
def click_locatin_element(element, text):
    """Locate a node by XPath in the shared browser and click it.

    Args:
        element: XPath expression identifying the node to click.
        text: Label for the node; used only in the failure message.

    Selenium errors raised while locating or clicking the node are reported
    by printing ``text + "不可点击"``; they are not propagated to the caller.
    Non-Selenium exceptions (for example ``KeyboardInterrupt``) are no longer
    swallowed.
    """
    # Imported here so the function is self-contained with respect to the
    # file-level import block, which does not import this name.
    from selenium.common.exceptions import WebDriverException

    try:
        # The generic find_element(By.XPATH, ...) form is used because the
        # find_element_by_xpath shortcut is not available in newer Selenium
        # releases; with the old bare ``except`` that failure was hidden.
        browser.find_element(By.XPATH, element).click()
    except WebDriverException:
        print(text + "不可点击")
def my_time(func):
    """Decorator that prints how long each call to *func* takes.

    The wrapper forwards all positional and keyword arguments to *func*,
    prints ``totally cost :<seconds>秒`` (two decimal places) after the call
    completes, and returns whatever *func* returned. If *func* raises, the
    exception propagates and nothing is printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same call signature and metadata as *func*.
    """
    import functools

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        # perf_counter is a monotonic clock, so the measured interval is not
        # affected by system clock adjustments during the run.
        started = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - started
        print('totally cost :{:.2f}'.format(elapsed) + '秒')
        return result

    return wrapper
@my_time  # print the total run time once the scrape finishes
def total():
    """Scrape the school table from result pages 1-36 and append it to a CSV.

    For each page the rows of the ``seachtab`` table are read: the link in
    the first cell provides the school URL (its ``href``) and name, and
    cells 2-5 provide the remaining four columns. The output starts with a
    row of column labels followed by one six-column row per table row, and is
    appended (no index, no pandas header line) to
    ``C:/Users/Administrator/Desktop/高校本科.csv``.

    The browser session is always shut down, including when a page fails
    part-way through; in that case the exception propagates and no CSV is
    written.
    """
    # First output row holds the column labels.
    rows = [['id', 'name', 'loc', '本科', '全国热度排名', '类别热度排名']]

    # argschtype is the URL-encoded value "普通本科".
    # Alternative query used previously (argschtype = "独立学院"):
    #   ...&argschtype=%E7%8B%AC%E7%AB%8B%E5%AD%A6%E9%99%A2&page=
    base_url = ('https://gkcx.eol.cn/soudaxue/queryschool.html?&schoolflag='
                '&argschtype=%E6%99%AE%E9%80%9A%E6%9C%AC%E7%A7%91&page=')
    row_xpath = '//*[@id="seachtab"]/tbody/tr'

    try:
        for page_num in range(1, 37):
            go_url(base_url + str(page_num))  # load the result page
            if page_num == 1:
                # Click the Hubei province entry (province_code 42).
                click_locatin_element(
                    '//div[@class="citybox clearfix"]//div[@province_code="42"]',
                    '湖北')

            # Number of rows in the table on this page.
            rowcount = len(browser.find_elements(By.XPATH, row_xpath))
            print(rowcount)

            for n in range(1, rowcount + 1):
                # Cells are looked up by absolute XPath on every access, as
                # in the original, rather than through cached row elements.
                cell_prefix = '{}[{}]/td['.format(row_xpath, n)
                link = browser.find_element(By.XPATH, cell_prefix + '1]/a')
                record = [link.get_attribute('href'), link.text]
                # Cells 2-5: province, education level, national popularity
                # rank, category popularity rank.
                for i in range(2, 6):
                    cell = browser.find_element(
                        By.XPATH, cell_prefix + str(i) + ']')
                    record.append(cell.text)
                # Only complete six-column records are added.
                rows.append(record)
    finally:
        # quit() ends the whole driver session rather than just closing the
        # current window, and runs even if scraping raised.
        browser.quit()

    df = pds.DataFrame(rows)
    df.to_csv('C:/Users/Administrator/Desktop/高校本科.csv', sep=',',
              mode='a', index=False, header=False)
# Run the scrape only when this file is executed directly as a script.
if __name__ == "__main__":
    total()
#//*[@id="seachtab"]/tbody/tr[14]/td[2] //*[@id="seachtab"]/tbody/tr[14]/td[1]/span[2]
赞 (0)