python爬虫初级入门

最近一个项目,需要用到地区按拼音索引。在网上没有找到合适的数据。决定用python写个爬虫脚本,爬取美团的城市列表,然后再请求高德行政区域查询接口补充数据。

import urllib.request
from bs4 import BeautifulSoup
import json
import requests

def getLocation(name, key='809e7cf228d47949c89155055f5ceba9', timeout=10):
    """Look up a city's code and centre coordinates via the Amap district API.

    Args:
        name: City name, sent as the ``keywords`` query parameter.
        key: Amap web-service API key. Defaults to the key this script was
            originally written with.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error instead of blocking indefinitely.

    Returns:
        A ``(city_code, longitude, latitude)`` tuple taken from the first
        entry of ``districts``; the two coordinates are the text before and
        after the comma in that entry's ``center`` field. Returns ``None``
        when the response has no usable district or ``center`` value.

    Raises:
        requests.RequestException: On network failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    # NOTE(review): the default API key is hard-coded in source; prefer
    # passing ``key`` explicitly (e.g. read from configuration) in real use.
    params = {'key': key, 'keywords': name}
    url = 'https://restapi.amap.com/v3/config/district'
    res = requests.get(url, params=params, timeout=timeout)
    res_dict = json.loads(res.content)

    # An error response may omit 'districts'/'count'; treat that as
    # "not found" instead of letting a KeyError escape.
    districts = res_dict.get('districts') or []
    if not districts or int(res_dict.get('count') or 0) <= 0:
        return None

    first = districts[0]
    center = first.get('center')
    # NOTE(review): the API appears to use an empty list for empty fields;
    # guard against any non-string value before splitting - confirm.
    if not isinstance(center, str):
        return None

    # partition() always yields three items, so the separator (not the tuple
    # length) is what tells us the "<lng>,<lat>" format was actually present.
    longitude, sep, latitude = center.partition(',')
    if not sep:
        return None
    return first.get('citycode'), longitude, latitude

# Download Meituan's city-list page. The context manager closes the HTTP
# response once the body has been read.
with urllib.request.urlopen("https://www.meituan.com/changecity/") as response:
    html = response.read().decode("utf-8")  # decode the page bytes as UTF-8

bf = BeautifulSoup(html, 'html.parser')  # parse the page
labels = bf.find_all('span', class_='city-label')  # first-letter index labels
city_list = []
for label in labels:
    letter = label.get_text()
    datas = bf.select('#city-' + letter)  # containers holding this letter's cities
    for item in datas:
        citys = []
        for city in item.find_all(class_='city'):  # every city element in the container
            # Query by the element's text, not the Tag object itself.
            name = city.get_text().strip()
            if not name:
                continue
            info = getLocation(name)  # (code, longitude, latitude) or None
            if info is None:
                # No district data came back; report which city was skipped.
                print('no location found for', name)
                continue
            citys.append({
                'name': name,
                'code': info[0],
                'Longitude': info[1],
                'Latitude': info[2],
            })
        city_list.append({'letter': letter, 'data': citys})
print(json.dumps(city_list))