Parcourir la source

爬取广州市市监管局药品-爬虫

deng il y a 5 ans
Parent
commit
2727da6dae
1 fichiers modifiés avec 110 ajouts et 0 suppressions
  1. 110 0
      yaopin.txt

+ 110 - 0
yaopin.txt

@@ -0,0 +1,110 @@
+import requests
+from  bs4 import BeautifulSoup
+import random
def get_ip_list():
    """Scrape one randomly chosen page of the xicidaili proxy listing.

    A page number between 1 and 3000 is drawn at random, the page is fetched
    with a desktop-browser User-Agent, and every table row after the first
    that has at least three ``<td>`` cells contributes one entry.

    Returns:
        list[str]: ``"<cell 1 text>:<cell 2 text>"`` strings in page order
        (presumably ``host:port`` -- the column meaning is not visible here;
        verify against the page). Empty when no row qualifies.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not answer within the timeout.
    """
    num = random.randint(1, 3000)
    print(num)
    url = 'http://www.xicidaili.com/nn/%d' % num
    print(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    # Bound the wait: without a timeout an unresponsive host blocks forever.
    web_data = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(web_data.text, 'html.parser')

    ip_list = []
    # The first <tr> is skipped exactly as before (presumably the header row).
    for row in soup.find_all('tr')[1:]:
        tds = row.find_all('td')
        # Rows with too few cells (extra header rows, error/ban pages) used to
        # raise IndexError and abort the whole scrape; skip them instead.
        if len(tds) < 3:
            continue
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list
def get_random_ip():
    """Choose one proxy at random from the entries of ``get_ip_list()``.

    Each entry is prefixed with ``http://`` before the draw.

    Returns:
        dict: ``{'http': 'http://<entry>'}`` for the chosen entry.

    Raises:
        IndexError: if ``get_ip_list()`` returned no entries, because
            ``random.choice`` cannot pick from an empty sequence.
    """
    candidates = ['http://' + address for address in get_ip_list()]
    return {'http': random.choice(candidates)}
def getagent():
    """Return a randomly selected User-Agent string.

    The index of the chosen entry is printed as ``agent:<index>`` before the
    string is returned.

    Returns:
        str: one of the ten User-Agent strings listed below.
    """
    user_agents = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
    )
    # Inclusive upper bound is the last valid index (9 for the ten entries).
    index = random.randint(0, len(user_agents) - 1)
    print(f"agent:{index}")
    return user_agents[index]
def _extract_drug_fields(row):
    """Return the stripped text of the nine drug columns found in *row*.

    Each column is located by the value of its ``n`` attribute, in output
    order. Raises AttributeError when a column is absent, because ``find``
    then returns ``None``.
    """
    columns = (
        'approve_no',
        'SERIAL_MEDI',
        'YPMC',
        'trade_name',
        'SPEC_NAME',
        'BZGG',
        'MIN_UNIT_NAME',
        'factory_name',
        'BASE_DRUG_TYPE',
    )
    return [str(row.find(n=column).get_text()).strip() for column in columns]


def main(li):
    """Fetch one page of the drug table and append its rows to the output file.

    Posts the search form with ``pageNum`` set to *li* and ``pageSize`` 100,
    parses every ``<tr>`` of the response, and for each row that carries all
    nine expected cells prints the comma-joined values and appends the same
    line to ``outPut/市监管局药品.txt``. Rows missing a cell are reported with
    the same message as before and skipped.

    Args:
        li: page number sent as the ``pageNum`` form field.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not answer within the timeout.
        OSError: if the output file cannot be opened (for example when the
            ``outPut`` directory does not exist).
    """
    url = "http://ypgl.gzfda.gov.cn/pj18/jsp/pj18/portal/drug/medi/MediDataTable.xhtml"
    formdata = {
        'YPMC': '',
        'approve_no': '',
        'trade_name': '',
        'factoryLabel': '',
        'factory': '',
        'base_drug_typeLabel': '',
        'base_drug_type': '',
        'pageSize': 100,
        'pageNum': li,
        'tableId': 'BS_MEDI_sec',
        'primaryKey': 'serial_medi',
        'checkable': 'true',
        'lineNumEnabled': 'false',
        'recordNumEnabled': 'false',
        'refer': '/jsp/pj18/portal/drug/medi/Medi.fs',
    }
    # Original comment (translated): "because of the Anjuke site's
    # anti-crawler measures, a header must be set here".
    # NOTE(review): the Referer below points at anjuke.com and contains
    # spaces, so it looks copied from another crawler -- confirm whether this
    # server needs a Referer at all. The hard-coded JSESSIONID is presumably
    # a captured session and may have expired -- verify before a long run.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Referer': 'https: // wuhan.anjuke.com / sale /?from=navigation',
        'User-Agent': getagent(),
        'cookie': "JSESSIONID=729E613D9D5535FA6BDF0EB909A6B492",
    }

    # Bound the wait: without a timeout one stalled page blocks the whole run.
    response = requests.post(url, data=formdata, headers=headers, timeout=30)
    bs = BeautifulSoup(response.text, 'html.parser')
    rows = bs.find_all('tr')

    # The context manager closes the handle on every call; previously one
    # file object per page was opened and never closed.
    with open("outPut/市监管局药品.txt", 'a', encoding="utf-8", buffering=1) as out_file:
        for row in rows:
            try:
                fields = _extract_drug_fields(row)
            except AttributeError:
                # A cell is missing (e.g. a header row). Only this expected
                # failure is caught, so Ctrl-C and real bugs still surface.
                print("未知错误")
                continue
            line = ",".join(fields)
            print(line)
            out_file.write(line + "\n")
+
if __name__ == '__main__':
    # Crawl result pages 1 through 1086, one request per page, in order.
    FIRST_PAGE, LAST_PAGE = 1, 1086
    for page_number in range(FIRST_PAGE, LAST_PAGE + 1):
        main(page_number)