批量获取IP地址来源,做分类汇总!
我们做用户分析时离不开用户来源分析,在很多网站/交易平台,我们的用户在没有注册或产生消费时,是没有用户自行上传的来源信息的。
所以程序员或数据分析师通常会根据用户访问时的IP地址做进一步的划分和判断,这样就能统计出仍在观望(尚未注册或消费)的用户的地域分布情况。
甚至有些黑客会利用国外IP恶意访问、攻击我们的平台;确认IP来源之后,我们就可以提前屏蔽一部分恶意攻击行为。
接口
urls = r'https://open.onebox.so.com/dataApi?callback=jQuery18309089439851148142_1533546946966&type=ip&src=onebox&tpl=0&num=1&query=ip&ip=' + str(ips) + '&url=ip' + str(int(time.time() * 1000))
案例数据源
python解析代码
# coding: utf-8
# -*- coding: utf-8 -*-
import urllib.request
import urllib.parse
from urllib.error import HTTPError ,URLError
import socket #请求超时异常
from bs4 import BeautifulSoup
import requests
import time
import json
import csv
import re
import random
import datetime
import pymysql
def response(url, csv_writer=None):
    """Fetch one IP-lookup response and append its location row to the CSV.

    The response body is expected to contain a single JSON object (the
    endpoint is called with a ``callback=`` parameter, so the object is
    wrapped in a JSONP call). Keys ``'0'``, ``'1'`` and ``'2'`` are read as
    country, province and city; ``'5'`` as classification, ``'4'`` as
    ``Local`` and ``'ip'`` as the queried address.

    Args:
        url: Fully built lookup URL.
        csv_writer: Optional ``csv.writer``-like object to receive the row.
            When omitted, the module-level ``writer`` is used, which keeps
            existing ``response(url)`` callers working.

    Returns:
        None. On a network, decoding or parsing failure the URL is skipped
        after logging a warning, so one bad lookup does not abort a batch.
    """
    import logging  # local import so the block does not rely on file-level changes

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
    }
    try:
        req = urllib.request.Request(url, headers=headers)
        # Context manager closes the HTTP connection even if read/decode fails.
        with urllib.request.urlopen(req, timeout=60) as resp:
            body = resp.read().decode('utf-8')
        body = str(BeautifulSoup(body, 'html.parser'))

        # Take everything from the first '{' to the last '}'. re.S lets the
        # match span newlines, so a pretty-printed payload is still valid JSON
        # (joining per-line matches with spaces would not be).
        match = re.search(r'\{.*\}', body, re.S)
        if match is None:
            raise ValueError('no JSON object found in response')
        jsonobj = json.loads(match.group(0))

        country = jsonobj.get('0')
        province = jsonobj.get('1')
        if province is None:
            # No province level: fall back to the country.
            province = country
        city = jsonobj.get('2')
        if city is None:
            # No city level: fall back to the province (which is itself the
            # country when the province was missing).
            city = province
        classification = jsonobj.get('5')
        ip_id = jsonobj.get('ip')
        Local = jsonobj.get('4')
    except (HTTPError, URLError, socket.timeout, AttributeError, ValueError) as e:
        # ValueError also covers json.JSONDecodeError and UnicodeDecodeError.
        logging.getLogger(__name__).warning('IP lookup skipped for %s: %s', url, e)
        return

    print(ip_id, country, province, city, classification, Local)
    # Written outside the try block so that output errors are never mistaken
    # for a failed lookup and silently dropped.
    out = writer if csv_writer is None else csv_writer
    out.writerows([[ip_id, country, province, city, classification, Local]])
if __name__ == '__main__':
    # Batch driver: read IPs from the input CSV, look each one up and write
    # the results to the output CSV.
    file_name = r'D:\Case_data/360IP归属地查询' + '.csv'
    # `with` guarantees the output file is flushed and closed even if a
    # lookup raises part-way through the batch.
    with open(file_name, 'w+', newline='', encoding='gb18030') as f:
        # `writer` stays a module-level name on purpose: response() writes
        # its rows through it.
        writer = csv.writer(f, dialect='excel')
        # Header row first.
        writer.writerow(['ip_id', 'country', 'province', 'city', 'classification', 'Local'])

        # Load the IP values from the 4th column of the input file; rows with
        # fewer than 4 columns (e.g. blank lines) are skipped instead of
        # raising IndexError.
        with open(r'D:\Case_data/ips.csv', 'rt', newline='', encoding='gb18030') as csvfile:
            column = [row[3] for row in csv.reader(csvfile) if len(row) > 3]

        for ips in column:
            ip_value = str(ips).strip()
            if not ip_value:
                continue  # nothing to look up
            # Random 2-5 s pause before each request to throttle submissions.
            time.sleep(random.randint(2, 5))
            # Percent-encode the value so stray characters cannot corrupt the
            # query string (a no-op for plain dotted IPv4 addresses).
            urls = r'https://open.onebox.so.com/dataApi?callback=jQuery18309089439851148142_1533546946966&type=ip&src=onebox&tpl=0&num=1&query=ip&ip=' + urllib.parse.quote(ip_value) + '&url=ip' + str(int(time.time() * 1000))
            response(urls)
分类汇总
# Count records per province, sort descending, and add a percentage column.
# NOTE: `display` is the notebook (IPython/Jupyter) helper; this cell is
# meant to be run in a notebook.
import numpy as np
import pandas as pd

# The result file is written with encoding='gb18030', so it must be read back
# with the same encoding (pandas would otherwise assume UTF-8).
df = pd.read_csv(r'D:\Case_data/360IP归属地查询.csv', engine='python', encoding='gb18030')
display(df.head(5))

gd = df.groupby(['province']).agg({'province': ['count']})
gd.columns = ['数量']
gd = gd.sort_values(by=['数量'], ascending=False)  # most frequent province first
# Share of each province among all rows with a non-null province, as "xx.xx%".
gd['占比'] = (((gd['数量'] / df['province'].count()) * 100).round(2).astype('str')) + '%'
display(gd.head(5))