
半生月
V1
2023/05/11阅读:13主题:橙心
数据资源整理【二】:整理329万多条姓名数据sqlite3、Excel、csv文件【文末下载链接】
数据资源整理【二】:整理329万多条姓名数据sqlite3、Excel、csv文件【文末下载链接】
数据说明
:
def get_name_link():
    """Fetch the surname index page and return one entry per surname link.

    NOTE(review): the ``def`` line was missing from this listing; the name
    and the empty parameter list are taken from the ``get_name_link()`` call
    in the ``__main__`` section -- confirm against the original script.

    Returns:
        A list of dicts ``{"name": <anchor text>, "link": <URL string>}``,
        one for each element matched by the ``btn btn2`` class lookup that
        carries an ``href`` attribute.
    """
    url = "http://www.resgain.net/xmdq.html"
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    res = requests.get(url, timeout=30)
    soup = BeautifulSoup(res.text, 'lxml')
    return [
        {
            "name": anchor.text,
            "link": "https://www.resgain.net/" + anchor.get('href'),
        }
        for anchor in soup.find_all(attrs={'class': 'btn btn2'})
        # Elements without an href cannot be turned into a link; skipping
        # them avoids a TypeError from concatenating str and None.
        if anchor.get('href')
    ]
def get_data(first_name, url):
    """Download one name-list page and store its names in ``tools_app.db``.

    Every element matched by the ``cname`` class lookup becomes one row in
    the ``names`` table. The page is fetched and parsed before the database
    is opened, and all rows are written in a single transaction.

    Args:
        first_name: Value stored in the ``first_name`` column of every row.
        url: Page to fetch. When it contains ``gender=1`` the rows are
            stored with sex "男", otherwise with "女".
    """
    sex_ = "男" if "gender=1" in url else "女"

    # A timeout keeps the crawl from hanging forever on a stalled connection.
    res = requests.get(url, timeout=30)
    soup = BeautifulSoup(res.text, 'lxml')
    rows = [
        (first_name, tag.text, sex_)
        for tag in soup.find_all(attrs={'class': 'cname'})
    ]

    con = sqlite3.connect(r'tools_app.db')
    try:
        # Placeholders let sqlite3 do the quoting, so scraped text that
        # contains a quote can neither break nor alter the statement.
        con.executemany(
            "insert into names (first_name,name,sex) values(?,?,?);",
            rows,
        )
        con.commit()
    finally:
        # Release the database file even if an insert fails.
        con.close()
    print(url, "完成")
def create_db():
    """Create ``tools_app.db`` and its ``names`` table if they are missing.

    The table has an auto-incrementing integer ``id`` plus three required
    text columns: ``first_name``, ``name`` and ``sex``. Calling this again
    on an existing database leaves the table untouched because of the
    ``if not exists`` clause.
    """
    conn = sqlite3.connect(r"tools_app.db")
    try:
        conn.execute('''CREATE TABLE if not exists names
(id INTEGER PRIMARY KEY AUTOINCREMENT,
first_name TEXT NOT NULL,
name TEXT NOT NULL,
sex TEXT NOT NULL);''')
        print("names database created successfully")
        conn.commit()
    finally:
        # Close the connection even when the statement fails.
        conn.close()
if __name__ == '__main__':
    # Create the database file and the ``names`` table if needed.
    create_db()
    # Collect the per-surname list pages.
    name_link_list = get_name_link()

    # NOTE(review): the first 70 surnames are skipped. This looks like a
    # resume offset left over from an interrupted run -- set it to 0 to
    # crawl every surname.
    start_index = 70
    for name_link in name_link_list[start_index:]:
        link = name_link.get("link")
        first_name = name_link.get("name")
        # gender=1 is requested first, then gender=0; get_data stores "男"
        # for URLs containing gender=1 and "女" otherwise.
        for gender in (1, 0):
            get_data(first_name, link + "&gender={0}&wx1=&wx2=".format(gender))
            # Pause 1-3 seconds after every request.
            time.sleep(random.randint(1, 3))
        print(first_name, "完成!")
下载地址
作者介绍

半生月
V1
微信公众号【帅帅的Python】