yaopin.txt

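# Scraper for the Guangzhou FDA drug registry (ypgl.gzfda.gov.cn): it pages
# through the MediDataTable.xhtml query endpoint and appends one
# comma-separated record per drug to outPut/市监管局药品.txt. The first two
# helpers scrape free HTTP proxies from xicidaili.com; getagent() rotates
# User-Agent strings.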
import requests
from bs4 import BeautifulSoup
import random


def get_ip_list():
    # Fetch a random page of xicidaili.com's free-proxy listing and return
    # the proxies it advertises as "ip:port" strings.
    num = random.randint(1, 3000)
    print(num)
    url = 'http://www.xicidaili.com/nn/%d' % num
    print(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'html.parser')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):  # skip the table's header row
        tds = ips[i].find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)  # columns 1 and 2 hold IP and port
    return ip_list
def get_random_ip():
    # Turn the scraped list into "http://ip:port" URLs and pick one at
    # random, returned in the dict shape that requests' proxies= expects.
    ip_list = get_ip_list()
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies
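
# NOTE: get_random_ip() is defined but never called in main() below. A minimal
# sketch of wiring it in (hypothetical; free proxies from lists like this are
# unreliable, so a timeout and a retry on failure would be prudent):
#
#     response = requests.post(url, data=formdata, headers=headers,
#                              proxies=get_random_ip(), timeout=10)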
def getagent():
    # Pick one of ten desktop User-Agent strings at random so successive
    # requests look less uniform.
    list1 = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)'
    ]
    a = random.randint(0, 9)
    print("agent:" + str(a))
    return list1[a]
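
# random.choice(list1) would be the idiomatic equivalent; the explicit index
# is kept only so the chosen agent's position can be printed for debugging.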
def main(li):
    # Request one page of the drug registry; `li` is the 1-based page number.
    url = "http://ypgl.gzfda.gov.cn/pj18/jsp/pj18/portal/drug/medi/MediDataTable.xhtml"
    formdata = {
        'YPMC': '',
        'approve_no': '',
        'trade_name': '',
        'factoryLabel': '',
        'factory': '',
        'base_drug_typeLabel': '',
        'base_drug_type': '',
        'pageSize': 100,
        'pageNum': li,
        'tableId': 'BS_MEDI_sec',
        'primaryKey': 'serial_medi',
        'checkable': 'true',
        'lineNumEnabled': 'false',
        'recordNumEnabled': 'false',
        'refer': '/jsp/pj18/portal/drug/medi/Medi.fs'
    }
    headers = {  # the site rejects bare requests, so browser-like headers are required
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        # leftover Referer from the Anjuke scraper this header block was copied from
        'Referer': 'https://wuhan.anjuke.com/sale/?from=navigation',
        'User-Agent': getagent(),
        'cookie': "JSESSIONID=729E613D9D5535FA6BDF0EB909A6B492"
    }
    response = requests.post(url, data=formdata, headers=headers)
    bs = BeautifulSoup(response.text, 'html.parser')
    rows = bs.find_all('tr')
    with open("outPut/市监管局药品.txt", 'a', encoding="utf-8", buffering=1) as file:
        for row in rows:
            try:
                # each data <td> carries an n="..." attribute naming its field,
                # which is what the n= keyword filter matches on
                fields = [
                    row.find(n='approve_no').get_text(),      # approval number
                    row.find(n="SERIAL_MEDI").get_text(),     # drug code
                    row.find(n="YPMC").get_text(),            # drug name
                    row.find(n="trade_name").get_text(),      # trade name
                    row.find(n="SPEC_NAME").get_text(),       # specification
                    row.find(n="BZGG").get_text(),            # packaging spec
                    row.find(n="MIN_UNIT_NAME").get_text(),   # packaging unit
                    row.find(n="factory_name").get_text(),    # manufacturer
                    row.find(n="BASE_DRUG_TYPE").get_text(),  # base drug type
                ]
                line = ",".join(str(f).strip() for f in fields)
                print(line)
                file.write(line + "\n")
            except Exception:  # header rows lack these cells; skip them
                print("Unknown error")
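
# The hard-coded JSESSIONID cookie above will eventually expire; a fresh value
# has to be copied from a browser session before a long crawl. Pacing the loop
# (e.g. time.sleep(1) after each main(li) call, with `import time` added at
# the top) would reduce the chance of the server dropping the session mid-run.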
if __name__ == '__main__':
    for li in range(1, 1087):  # pages 1 through 1086
        main(li)