123456789101112131415161718192021222324252627282930313233343536373839404142434445464748 |
- import json
- import requests
- import re
- from requests import RequestException
def get_page(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` can block indefinitely on a
            stalled connection.

    Returns:
        The response text when the server answers with status 200;
        ``None`` for any other status code or when the request raises a
        ``RequestException`` (connection error, timeout, ...).
    """
    # Keep the try body down to the single call that can raise.
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None

    if response.status_code == 200:
        print("状态:")
        return response.text

    print("状态3:")
    return None
def parse_page(html):
    """Yield one dict per table row matched in *html*.

    A row is matched as ``<tr>`` followed by a ``<th>`` cell and two
    ``</td>``-terminated cells. The three captured texts are returned
    under the keys '药品名称' (drug name), '超说明书适用症' (off-label
    indication) and '批准适应症' (approved indication), in that order.

    Args:
        html: Markup text to scan.

    Yields:
        A dict with the three keys above for every match, in document
        order.
    """
    # re.S lets '.' span newlines, so one row may cover several lines.
    row_pattern = re.compile(
        '<tr>.*?<th.*?>(.*?)</th>.*?>(.*?)</td>.*?>(.*?)</td>', re.S
    )
    field_names = ('药品名称', '超说明书适用症', '批准适应症')
    for cells in row_pattern.findall(html):
        yield dict(zip(field_names, cells))
def write_to_file(content, path='超用药说明txt'):
    """Append *content* to *path* as one JSON document per line.

    Args:
        content: JSON-serialisable object to store.
        path: Output file, opened in append mode with UTF-8 encoding.
            NOTE(review): the default name has no '.' before 'txt'; it
            is kept as-is so existing output locations do not change.
    """
    with open(path, 'a', encoding='utf-8') as out:
        # ensure_ascii=False keeps the Chinese text readable in the file.
        # The trailing newline separates records; without it successive
        # calls would run together as '{...}{...}', which is not
        # parseable as JSON or as line-delimited JSON.
        out.write(json.dumps(content, ensure_ascii=False) + '\n')
def main():
    """Parse the saved page and persist every extracted record.

    Reads 'data/超用药说明.txt' as UTF-8, runs its contents through
    ``parse_page``, and for each resulting record prints it and hands
    it to ``write_to_file``.
    """
    with open('data/超用药说明.txt', "r", encoding='utf-8') as source:
        page_text = source.read()

    for record in parse_page(page_text):
        print(record)
        write_to_file(record)
# Script entry point: announce start, run the pipeline, announce completion.
if __name__ == "__main__":
    print("开始")
    main()
    print("结束")
|