91一区在线,思热99re视热频这里只精品,国产亚洲一二三区

主頁 > 知識庫 > python爬蟲之教你如何爬取地理數據

python爬蟲之教你如何爬取地理數據

一、shapely模塊

1、shapely

shapely是python中開源的針對空間幾何進行處理的模塊，支持點、線、面等基本幾何對象類型以及相關空間操作。

2、幾何對象類型與shapely類的對應關系

point→Point類；
curve→LineString和LinearRing類；
surface→Polygon類；
對應的集合類型分別為MultiPoint、MultiLineString、MultiPolygon。

3、導入所需模塊

# 導入所需模塊
from shapely import geometry as geo
from shapely import wkt
from shapely import ops
import numpy as np
from shapely.geometry.polygon import LinearRing
from shapely.geometry import Polygon
from shapely.geometry import asPoint, asLineString, asMultiPoint, asPolygon

4、Point

（1）、創建point,主要有以下三種方法

# 創建point
pt1 = geo.Point([0,0])
coord = np.array([0,1])
pt2 = geo.Point(coord)
pt3 = wkt.loads("POINT(1 1)")
geo.GeometryCollection([pt1, pt2, pt3]) #批量可視化

最終三個點的結果如下所示：

（2）、point常用屬性

# point常用屬性
print(pt1.x) #pt1的x坐標
print(pt1.y)#pt1的y坐標
print(list(pt1.coords)) 
print(np.array(pt1))

輸出結果如下：

0.0
0.0
[(0.0, 0.0)]
[0. 0.]

（3）、point常用方法，計算距離

# point計算距離
d = pt2.distance(pt1) #計算pt1與pt2的距離, d =1.0

5、LineString

創建LineString主要有以下三種方法：

# LineString的創建
line1 = geo.LineString([(0,0),(1,-0.1),(2,0.1),(3,-0.1),(5,0.1),(7,0)])
arr = np.array([(2, 2), (3, 2), (4, 3)])
line2 = geo.LineString(arr)
line3 = wkt.loads("LineString(-2 -2,4 4)")

line1, line2, line3對應的直線如下所示

LineString常用方法：

print(line2.length) #計算線段長度：2.414213562373095
print(list(line2.coords)) #線段中點的坐標：[(2.0, 2.0), (3.0, 2.0), (4.0, 3.0)]
print(np.array(line2)) #將點坐標轉成numpy.array形式[[2. 2.],[3. 2.],[4. 3.]]
print(line2.bounds)#坐標范圍:(2.0, 2.0, 4.0, 3.0)
center = line2.centroid #幾何中心:
geo.GeometryCollection([line2, center])
bbox = line2.envelope #最小外接矩形
geo.GeometryCollection([line2, bbox])

rect = line2.minimum_rotated_rectangle #最小旋轉外接矩形
geo.GeometryCollection([line2, rect])

line2幾何中心：

line2的最小外接矩形：

line2的最小旋轉外接矩形：

#常用方法
d1 = line1.distance(line2) #線線距離： 1.9
d2 = line1.distance(geo.Point([-1, 0])) #點線距離：1.0
d3 = line1.hausdorff_distance(line2) #最大最小距離：4.242640687119285
#插值
pt_half = line1.interpolate(0.5, normalized = True)
geo.GeometryCollection([line1,pt_half])

#投影
ratio = line1.project(pt_half, normalized = True)
print(ratio)#project()方法是和interpolate方法互逆的:0.5

插值：

Douglas-Peucker算法（道格拉斯-普克算法）：將曲線近似表示為一系列點，并減少點的數量的一種算法。

# Douglas-Peucker simplification demo: rebuild line1 with larger vertical
# offsets, simplify it, then buffer the original line.
line1 = geo.LineString([(0, 0), (1, -0.2), (2, 0.3), (3, -0.5), (5, 0.2), (7,0)])
# tolerance 0.4; preserve_topology=False is passed explicitly
line1_simplify = line1.simplify(0.4, preserve_topology=False)
print(line1)# LINESTRING (0 0, 1 -0.2, 2 0.3, 3 -0.5, 5 0.2, 7 0)
print(line1_simplify)# LINESTRING (0 0, 2 0.3, 3 -0.5, 5 0.2, 7 0) -- only the vertex (1, -0.2) is dropped
buffer_with_circle = line1.buffer(0.2) # buffer of distance 0.2 around the line; the ends are expanded as semicircles
geo.GeometryCollection([line1,buffer_with_circle])

道格拉斯-普克算法化簡后的結果

6、LinearRing:（是一個封閉圖形）

# A LinearRing is a closed line: the first point is implicitly joined to the last.
ring = LinearRing([(0, 0), (1, 1), (1, 0)])
# Length is about 3.41 because the closing segment back to the start is counted.
print(ring.length)
# A ring is a one-dimensional curve, so its area is 0.
print(ring.area)
geo.GeometryCollection([ring])

7、Polygon:(多邊形）

polygonl = Polygon([(0, 0), (1, 1), (1, 0)])
ext = [(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)]
int1 = [(1, 0), (0.5, 0.5), (1, 1), (1.5, 0.5), (1, 0)]
polygon2 = Polygon(ext, [int1])
print(polygonl.area)#幾何對象的面積：0.5
print(polygonl.length)#幾何對象的周長：3.414213562373095
print(polygon2.area)#其面積是ext的面積減去int的面積：3.5
print(polygon2.length)#其長度是ext的長度加上int的長度：10.82842712474619
print(np.array(polygon2.exterior)) #外圍坐標點：
#[[0. 0.]
 #[0. 2.]
 #[2. 2.]
 #[2. 0.]
# [0. 0.]]
geo.GeometryCollection([polygon2])

8、幾何對象的關系：內部、邊界與外部

#obj.contains(other) == other.within(obj)
coords = [(0, 0), (1, 1)]
print(geo.LineString(coords).contains(geo.Point(0.5, 0.5)))#包含：True

print(geo.LineString(coords).contains(geo.Point(1, 1)))#False
polygon1 = Polygon([(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)])
print(polygon1.contains(geo.LineString([(1.0, 1.0), (1.0, 0)])))#面與線關系:True
#contains方法也可以擴展到面與線的關系以及面與面的關系
geo.GeometryCollection([polygon1, geo.LineString([(1.0, 1.0), (1.0, 0)])])

#obj.crosses(other):相交與否
print(geo.LineString(coords).crosses(geo.LineString([(0, 1), (1, 0)])))#：True
geo.GeometryCollection([geo.LineString(coords), geo.LineString([(0, 1), (1, 0)])])
#obj.disjoint(other):均不相交返回True
print(geo.Point(0, 0).disjoint(geo.Point(1, 1)))
#object.intersects(other)如果該幾何對象與另一個幾何對象只要相交則返回True。
print(geo.LineString(coords).intersects(geo.LineString([(0, 1), (1, 0)])))#True

#object.convex_hull返回包含對象中所有點的最小凸多邊形（凸包）
points1 = geo.MultiPoint([(0, 0), (1, 1), (0, 2), (2, 2), (3, 1), (1, 0)])
hull1 = points1.convex_hull
geo.GeometryCollection([hull1, points1])

#object.intersection  返回對象與對象之間的交集
polygon1 = Polygon([(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)])
hull1.intersection(polygon1)

#返回對象與對象之間的并集
hull1.union(polygon1)

#面面補集
hull1.difference(polygon1)

9、point、LinearRing、LineString與numpy中的array互相轉換

# NOTE(review): the as* adapter helpers appear to be deprecated in Shapely 1.8
# and removed in Shapely 2.0 -- confirm the installed version; the regular
# geo.Point / geo.LineString / geo.MultiPoint / Polygon constructors also
# accept NumPy arrays.
pa = asPoint(np.array([0, 0])) # convert a NumPy array to a point

# Convert a NumPy array to a LineString
la = asLineString(np.array(([[1.0, 2.0], [3.0, 4.0]])))

# Convert a NumPy array to a MultiPoint collection
ma = asMultiPoint(np.array([[1.1, 2.2], [3.3, 4.4], [5.5, 6.6]]))

# Convert a NumPy array to a polygon
pg = asPolygon(np.array([[1.1, 2.2], [3.3, 4.4], [5.5, 6.6]]))

二、geopandas模塊

geopandas拓展了pandas，共有兩種數據類型:GeoSeries、GeoDataFrame

下述是利用geopandas庫繪制世界地圖：

import pandas as pd
import geopandas 
import matplotlib.pyplot as plt
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) #read_file方法可以讀取shape文件
world.plot()
plt.show()

world.head()

#根據每一個polygon的pop_est不同，便可以用python繪制圖表顯示不同國家的人數
fig, ax = plt.subplots(figsize = (9, 6), dpi = 100)
world.plot('pop_est', ax = ax, legend =True)
plt.show()

python對海洋數據進行預處理操作（這里我發現，tqdm模塊可以顯示進度條，感覺很高端，像下面這樣）

1、導入模塊

```python
import pandas as pd
import geopandas as gpd
from pyproj import Proj #左邊轉換
from keplergl import KeplerGl
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import shapely
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimSun'] #指定默認字體為新宋體
plt.rcParams['axes.unicode_minus'] = False

DataFrame獲取數據，坐標轉換，計算距離

#獲取文件夾中的數據
def get_data(file_path, model):
    """Load every file in a directory into a single DataFrame.

    The first line of each file is skipped; every remaining line is stripped
    and split on commas to form one row.

    Args:
        file_path: Directory whose entries are all opened and read as UTF-8.
        model: Either 'train' or 'test'. For 'test' a 'type' column filled
            with 'unknown' is appended before the columns are named.

    Returns:
        DataFrame with columns ID, lat, lon, speed, direction, time, type.
        lat, lon and speed are cast to float and direction to int.

    Raises:
        AssertionError: If ``model`` is neither 'train' nor 'test'.
    """
    assert model in ['train', 'test'], '{} Not Support this type of file'.format(model)
    rows = []
    for name in tqdm(os.listdir(file_path)):
        with open('{}/{}'.format(file_path, name), encoding='utf-8') as handle:
            next(handle)  # discard the first line of the file
            rows.extend(raw.strip().split(',') for raw in handle)
    frame = pd.DataFrame(rows)
    if model != 'train':
        # Test files carry no label column, so add a placeholder one.
        frame['type'] = 'unknown'
    frame.columns = ['ID', 'lat', 'lon', 'speed', 'direction', 'time', 'type']
    for column in ('lat', 'lon', 'speed'):
        frame[column] = frame[column].astype(float)
    frame['direction'] = frame['direction'].astype(int)
    return frame
file_path = r"C:\Users\李\Desktop\datawheal\數據\hy_round1_train_20200102"
model = 'train'
#平面坐標轉經緯度
def transform_xy2lonlat(df):
    """Run the inverse Lambert-conformal-conic projection over a frame in place.

    The 'lon' column is passed as the first coordinate and the 'lat' column
    as the second to the inverse projection; the two results overwrite the
    'lon' and 'lat' columns respectively.
    NOTE(review): presumably both columns hold planar coordinates in the
    projection's units (us-ft) on entry -- verify against the data source.

    Args:
        df: DataFrame with 'lat' and 'lon' columns; modified in place.

    Returns:
        The same DataFrame object.
    """
    projection = Proj('+proj=lcc +lat_1=33.88333333333333 +lat_2=32.78333333333333 +lat_0=32.16666666666666 +lon_0=-116.25 +x_0=2000000.0001016 +y_0=500000.0001016001 +datum=NAD83 +units=us-ft +no_defs ')
    second_coord = df['lat'].values
    first_coord = df['lon'].values
    converted = projection(first_coord, second_coord, inverse=True)
    df['lon'] = converted[0]
    df['lat'] = converted[1]
    return df
#修改數據的時間格式
def reformat_strtime(time_str = None, START_YEAR = '2019'):
    """Rewrite a compact time string as '<year>-<aa>-<bb> <clock>'.

    The input is split on single spaces. Characters 0-1 and 2-3 of the first
    field become the two dash-separated parts after the year (presumably
    month and day), and the second field is appended unchanged.

    Args:
        time_str: Text such as '1110 11:58:19'.
        START_YEAR: Year prefix placed in front of the date part.

    Returns:
        The reformatted string, e.g. '2019-11-10 11:58:19'.
    """
    fields = time_str.split(" ")  # space-separated: date digits, then clock
    first_pair = fields[0][:2]
    second_pair = fields[0][2:4]
    date_text = "-".join((START_YEAR, first_pair, second_pair))
    return " ".join((date_text, fields[1]))
 
#計算兩個點的距離
def haversine_np(lon1, lat1, lon2, lat2):
    """Great-circle distance between two points using the haversine formula.

    Inputs are in degrees and may be scalars or array-likes, since every
    step uses NumPy ufuncs.

    Args:
        lon1, lat1: Longitude and latitude of the first point(s), in degrees.
        lon2, lat2: Longitude and latitude of the second point(s), in degrees.

    Returns:
        Distance in metres, computed with a sphere radius of 6367 km.
    """
    lam1, phi1, lam2, phi2 = (np.radians(v) for v in (lon1, lat1, lon2, lat2))
    half_dlam = (lam2 - lam1) / 2.0
    half_dphi = (phi2 - phi1) / 2.0
    hav = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    # 6367 km sphere radius, then kilometres -> metres
    return 6367 * central_angle * 1000

利用3-sigma算法對異常值進行處理，速度與時間

#計算時間的差值
def compute_traj_diff_time_distance(traj = None):
    """Attach per-point time and distance deltas to a trajectory.

    For each row after the first, 'time_array' holds the minutes elapsed
    since the previous row and 'dist_array' the haversine_np distance to the
    previous row. The first row has no predecessor, so it is filled with the
    mean of the remaining deltas.

    Args:
        traj: DataFrame with 'time' (datetime), 'lon' and 'lat' columns;
            the two new columns are written onto it.

    Returns:
        The same DataFrame with 'time_array' and 'dist_array' set.
    """
    later_times = traj['time'].iloc[1:].reset_index(drop=True)
    earlier_times = traj['time'].iloc[:-1].reset_index(drop=True)
    minutes_between = (later_times - earlier_times).dt.total_seconds() / 60

    lon_values = traj['lon'].values
    lat_values = traj['lat'].values
    dist_between = haversine_np(lon_values[1:], lat_values[1:],
                                lon_values[:-1], lat_values[:-1])

    # Prepend the mean so both lists have one entry per trajectory row.
    padded_minutes = [minutes_between.mean()] + minutes_between.tolist()
    padded_dist = [dist_between.mean()] + dist_between.tolist()

    row_labels = list(traj.index)
    traj.loc[row_labels, 'time_array'] = padded_minutes
    traj.loc[row_labels, 'dist_array'] = padded_dist
    return traj
#對軌跡進行異常點的剔除
def assign_traj_anomaly_points_nan(traj = None, speed_maximum = 23,time_interval_maximum = 200, coord_speed_maximum = 700):
    """Blank out implausible speeds and drop 3-sigma outlier points.

    Steps:
      1. Speeds above ``speed_maximum`` or below 0 are replaced with NaN.
      2. 'coord_speed' is derived as dist_array / time_array; rows whose
         'time_array' or 'coord_speed' lies more than 3 standard deviations
         from the mean are dropped.
      3. Of the remaining rows, those whose 'lon' or 'lat' lies more than
         3 standard deviations from the mean are dropped.

    The input frame is copied first, so the caller's DataFrame is left
    untouched.

    Args:
        traj: DataFrame with 'speed', 'lon', 'lat', 'time_array' and
            'dist_array' columns.
        speed_maximum: Upper bound for a valid reported speed.
        time_interval_maximum: Accepted for interface compatibility; not
            used by the current implementation.
        coord_speed_maximum: Accepted for interface compatibility; not used
            by the current implementation.

    Returns:
        A tuple ``(cleaned, [removed])``: the filtered DataFrame with a fresh
        0..n-1 index, and a one-element list holding the number of rows
        dropped.
    """
    def outside_n_sigma(values, n):
        # Boolean Series, True where a value is more than n population
        # standard deviations (ddof=0) away from the mean; NaN compares False.
        centre = values.mean()
        spread = values.std(ddof=0)
        return (values < centre - n * spread) | (values > centre + n * spread)

    # Work on a copy: avoids chained-assignment pitfalls on sliced frames and
    # keeps the caller's data unchanged.
    traj = traj.copy()

    # Invalid reported speeds become NaN (the rows themselves are kept).
    is_speed_anomaly = (traj['speed'] > speed_maximum) | (traj['speed'] < 0)
    traj['speed'] = traj['speed'].mask(is_speed_anomaly)

    # Speed implied by consecutive coordinates and timestamps.
    traj['coord_speed'] = traj['dist_array'] / traj['time_array']

    # Drop rows with an outlying time gap or an outlying implied speed.
    is_anomaly = outside_n_sigma(traj['time_array'], 3) | outside_n_sigma(traj['coord_speed'], 3)
    traj = traj[~is_anomaly].reset_index(drop = True)

    # Drop rows whose position is a 3-sigma outlier (sample std, as in pandas' default).
    if len(traj) != 0:
        lon_std, lon_mean = traj['lon'].std(), traj['lon'].mean()
        lat_std, lat_mean = traj['lat'].std(), traj['lat'].mean()
        lon_low, lon_high = lon_mean - 3 * lon_std, lon_mean + 3 * lon_std
        lat_low, lat_high = lat_mean - 3 * lat_std, lat_mean + 3 * lat_std
        is_position_anomaly = (
            (traj['lon'] > lon_high) | (traj['lon'] < lon_low)
            | (traj['lat'] > lat_high) | (traj['lat'] < lat_low)
        )
        traj = traj[~is_position_anomaly].reset_index(drop = True)

    # len(is_speed_anomaly) is the original row count.
    return traj, [len(is_speed_anomaly) - len(traj)]

file_path = r"C:\Users\李\Desktop\datawheal\數據\hy_round1_train_20200102"
model = 'train'
df = get_data(file_path, model)
# Convert the planar coordinate columns to longitude/latitude.
df = transform_xy2lonlat(df)
# Normalise the time strings, then parse them into datetime objects.
df['time'] = df['time'].apply(reformat_strtime)
df['time'] = df['time'].apply(lambda x: datetime.strptime(x,'%Y-%m-%d %H:%M:%S'))
# Per ID: drop anomalous points, then fill the blanked-out values.
ID_list = list(df['ID'].value_counts().index)
DF_NEW = []
Anomaly_count = []
for ID in tqdm(ID_list):
    df_id = compute_traj_diff_time_distance(df[df['ID'] == ID])
    df_new, count = assign_traj_anomaly_points_nan(df_id)
    # Linear interpolation fills interior gaps in speed.
    df_new['speed'] = df_new['speed'].interpolate(method = 'linear', axis = 0)
    # bfill: fill a gap with the NEXT valid value (covers leading gaps).
    # .bfill()/.ffill() replace the deprecated fillna(method=...) form.
    df_new = df_new.bfill()
    # ffill: fill a gap with the PREVIOUS valid value (covers trailing gaps).
    df_new = df_new.ffill()
    df_new['speed'] = df_new['speed'].clip(0, 23)  # keep speed within [0, 23]
    Anomaly_count.append(count)  # number of points removed for this ID
    DF_NEW.append(df_new)
DF = pd.concat(DF_NEW)

處理后的DF

利用Geopandas中的Simplify進行軌跡簡化和壓縮

# Douglas-Peucker demo: the points of a single ID's trajectory can be
# simplified and compressed with geopandas before further processing.
# Build a LineString from the lon/lat points of ID '11'.
line = shapely.geometry.LineString(np.array(df[df['ID'] == '11'][['lon', 'lat']]))
# Plot the original line in red, then the simplified line on the same axes in dashed blue.
ax = gpd.GeoSeries([line]).plot(color = 'red')
ax = gpd.GeoSeries([line]).simplify(tolerance = 0.000000001).plot(color = 'blue', ax = ax, linestyle = '--')
# Proxy artists used only to build the legend entries.
LegendElement = [Line2D([], [], color = 'red', label = '簡化前'),
                 Line2D([], [], color = 'blue', linestyle = '--', label = '簡化后')]
# Pass the prepared legend handles to legend().
ax.legend(handles = LegendElement, loc = 'upper left', fontsize = 10)
# Print the number of coordinates before and after simplification.
# NOTE(review): np.array(<geometry>) relies on the geometry exposing the array interface -- confirm it works with the installed Shapely version.
print('化簡前數據長度：' + str(len(np.array(gpd.GeoSeries([line])[0]))))
print('化簡后數據長度' + str(len(np.array(gpd.GeoSeries([line]).simplify(tolerance = 0.000000001)[0]))))
def simplify_dataframe(df):
    """Collapse each ID's points into one simplified LineString.

    The frame is grouped by 'ID' once. For every group the lon/lat points are
    turned into a shapely LineString, which is simplified through
    GeoSeries.simplify with a tolerance of 1e-9.

    Args:
        df: DataFrame with 'ID', 'type', 'lon' and 'lat' columns.

    Returns:
        GeoDataFrame with one row per ID and columns 'ID', 'type' (both
        taken from the group's first row) and 'geometry'.
    """
    line_list = []
    # Group once; the original rebuilt the full group dict on every access.
    for _, track in tqdm(df.groupby('ID')):
        line = shapely.geometry.LineString(np.array(track[['lon', 'lat']]))
        first_row = track.iloc[0]
        line_list.append({
            'ID': first_row['ID'],
            'type': first_row['type'],
            'geometry': gpd.GeoSeries([line]).simplify(tolerance = 0.000000001)[0],
        })
    # The class is GeoDataFrame (capital F); 'GeoDataframe' does not exist.
    return gpd.GeoDataFrame(line_list)

化簡前數據長度：377
化簡后數據長度156

這塊的df_gpd_change沒有讀出來，后續再發

df_gpd_change=pd.read_pickle(r"C:\Users\李\Desktop\datawheal\數據\df_gpd_change.pkl")        
map1=KeplerGl(height=800)#zoom_start與這個height類似，表示地圖的縮放程度
map1.add_data(data=df_gpd_change,name='data')
#當運行該代碼后，下面會有一個kepler.gl使用說明的鏈接，可以根據該鏈接進行學習參考

GeoHash編碼：利用二分法不斷縮小經緯度區間（經度區間[-180,180]二分為[-180,0]和[0,180]，緯度區間[-90,90]二分為[-90,0]和[0,90]），坐標大于區間中點記1，否則記0；經度位放在偶數位、緯度位放在奇數位交叉排列，再將二進制串每五位轉化為一個十進制數，并按對應編碼表進行Base32編碼。

2、geohash_encode編碼函數

def geohash_encode(latitude, longitude, precision = 12):
    """Encode a latitude/longitude pair as a GeoHash string.

    The longitude and latitude intervals are bisected alternately, starting
    with longitude. Each step emits a 1 bit when the coordinate is greater
    than the interval midpoint, otherwise 0. Every five bits are mapped to
    one character of the GeoHash base-32 alphabet.

    Args:
        latitude: Latitude in degrees, expected within [-90, 90].
        longitude: Longitude in degrees, expected within [-180, 180].
        precision: Number of characters to produce.

    Returns:
        The GeoHash string of length ``precision`` (empty when
        ``precision`` is 0 or negative).
    """
    lat_interval, lon_interval = (-90.0, 90.0), (-180, 180)
    base32 = '0123456789bcdefghjkmnpqrstuvwxyz'
    bits = [16, 8, 4, 2, 1]  # bit masks, most significant first
    geohash = []
    bit = 0      # position within the current 5-bit group
    ch = 0       # value of the 5-bit group accumulated so far
    even = True  # even steps refine longitude, odd steps refine latitude
    while len(geohash) < precision:
        if even:
            mid = (lon_interval[0] + lon_interval[1]) / 2
            if longitude > mid:
                ch |= bits[bit]
                lon_interval = (mid, lon_interval[1])
            else:
                lon_interval = (lon_interval[0], mid)
        else:
            mid = (lat_interval[0] + lat_interval[1]) / 2
            if latitude > mid:
                ch |= bits[bit]
                lat_interval = (mid, lat_interval[1])
            else:
                lat_interval = (lat_interval[0], mid)
        even = not even
        if bit < 4:
            bit += 1
        else:
            # Five bits collected: emit one character and start a new group.
            geohash.append(base32[ch])
            bit = 0
            ch = 0
    return ''.join(geohash)

到此這篇關于python爬蟲之地理數據分析的文章就介紹到這了,更多相關python地理數據內容請搜索腳本之家以前的文章或繼續瀏覽下面的相關文章希望大家以后多多支持腳本之家！

您可能感興趣的文章: