# 西刺ip爬虫.py: scraper for the free proxy IPs listed on xicidaili.com.
import random

import requests
from bs4 import BeautifulSoup
  4. def getip():
  5. ip = ['163.204.241.158 :9999',
  6. '182.35.87.174:9999',
  7. '182.35.87.175:9999',
  8. '182.35.87.176:9999',
  9. '182.35.87.177:9999',
  10. '123.163.96.131:9999',
  11. '1.197.16.130:9999',
  12. '182.35.85.157:9999',
  13. '163.204.244.212:9999',
  14. '163.204.243.252:9999',
  15. '117.91.132.170:9999',
  16. '58.253.156.1:9999',
  17. '1.197.16.129:9999',
  18. '163.204.246.126:9999',
  19. '60.13.42.67:9999',
  20. '49.86.182.174:9999',
  21. '182.35.80.229:9999',
  22. '58.253.156.225:9999',
  23. '163.204.240.126:9999',
  24. '60.13.42.70:9999',
  25. '123.163.122.108:9999',
  26. '175.42.122.46:9999',
  27. '123.163.97.234:9999',
  28. '182.35.82.130:9999',
  29. '163.204.245.72:9999',
  30. '60.13.42.127:9999',
  31. '121.233.206.73:9999',
  32. '163.204.246.149:9999',
  33. '182.35.83.33:9999',
  34. '163.204.243.16:9999',
  35. '163.204.246.174:9999',
  36. '121.233.251.60:9999',
  37. '123.169.37.250:38677',
  38. '120.83.109.29:9999',
  39. '163.204.241.164:9999',
  40. '163.204.246.42:9999',
  41. '163.204.241.228:9999',
  42. '175.42.68.4:9999',
  43. '182.35.84.155:9999',
  44. '112.85.130.205:9999',
  45. '122.193.247.115:9999',
  46. '163.204.241.190:9999',
  47. '163.204.245.237:9999',
  48. '163.204.242.245:9999',
  49. '115.53.19.82:9999',
  50. '112.85.128.146:9999',
  51. '163.204.244.40:9999',
  52. '182.35.80.5:9999',
  53. '163.204.242.130:9999',
  54. '112.85.129.88:9999',
  55. '113.121.20.143:9999',
  56. '182.35.83.136:9999',
  57. '182.35.80.195:9999',
  58. '120.83.105.248:9999',
  59. '112.85.151.152:9999',
  60. '171.11.178.94:9999',
  61. '171.12.113.6:9999',
  62. '112.85.165.194:9999',
  63. '123.163.122.254:9999',
  64. '58.253.158.174:9999',
  65. '120.84.101.164:9999',
  66. '60.13.42.61:9999',
  67. '60.13.42.207:9999',
  68. '1.198.72.219:9999',
  69. '182.35.80.54:9999',
  70. '114.230.69.232:9999',
  71. '163.204.242.126:9999',
  72. '58.253.154.253:9999',
  73. '180.108.218.242:9999',
  74. '112.85.149.238:9999',
  75. '114.230.69.109:9999',
  76. '60.13.42.28:9999',
  77. '163.204.244.39:9999',
  78. '180.108.218.179:9999',
  79. '121.233.251.82:9999',
  80. '113.121.20.252:808',
  81. '120.83.111.43:9999',
  82. '182.35.86.234:9999',
  83. '182.35.83.200:9999',
  84. '60.13.42.142:9999',
  85. '120.83.98.106:9999',
  86. '117.91.130.10:9999',
  87. '111.226.188.146:8010',
  88. '180.119.68.222:9999',
  89. '123.163.96.170:9999',
  90. '60.13.42.57:9999',
  91. '113.121.23.248:9999',
  92. '222.189.144.147:9999',
  93. '60.13.42.172:9999',
  94. '183.128.167.248:8118',
  95. '182.35.86.217:9999',
  96. '60.13.42.38:9999',
  97. '222.89.32.141:8070',
  98. '183.157.84.221:8118',
  99. '222.189.191.34:9999',
  100. '123.163.122.129:9999',
  101. '121.233.227.214:9999',
  102. '180.119.141.163:9999']
  103. b = random.randint(0,97)
  104. print("ip:"+str(b))
  105. return {'http':'http://'+ip[b]}
  106. def getagent():
  107. list1 = [
  108. 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
  109. 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
  110. 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
  111. 'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
  112. 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
  113. 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
  114. 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
  115. 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
  116. 'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
  117. 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)']
  118. a=random.randint(0, 9)
  119. print("agent:"+str(a))
  120. return list1[a]
  121. def get_ip_list():
  122. ipnum = 3775
  123. url = 'http://www.xicidaili.com/nn/'
  124. for num in ipnum:
  125. if(ipnum == 3776):
  126. ipnum = 3775
  127. headers = {
  128. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  129. 'Accept-Language': 'zh-CN,zh;q=0.8',
  130. 'Referer': 'https: // wuhan.anjuke.com / sale /?from=navigation',
  131. 'User-Agent': getagent()
  132. }
  133. web_data = requests.get(url,headers=headers)
  134. soup = BeautifulSoup(web_data.text, 'html.parser')
  135. ips = soup.find_all('tr')
  136. ip_list = []
  137. for i in range(1, len(ips)):
  138. ip_info = ips[i]
  139. tds = ip_info.find_all('td')
  140. ip_list.append(tds[1].text + ':' + tds[2].text)
  141. return ip_list
  142. def get_random_ip():
  143. ip_list = []
  144. proxy_list = []
  145. ip_list = get_ip_list()
  146. for ip in ip_list:
  147. proxy_list.append('http://' + ip)
  148. proxy_ip = random.choice(proxy_list)
  149. proxies = {'http': proxy_ip}
  150. return proxies
  151. if __name__ == '__main__':
  152. proxies = get_random_ip()
  153. print(proxies) # 函数get_ip_list(url, headers)传入url和headers,最后返回一个IP列表,列表的元素类似42.84.226.65:8888格式,这个列表包括国内髙匿代理IP网站首页所有IP地址和端口。
  154. headers = {
  155. 'User-Agent':'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3 like Mac OS X; wo-SN) AppleWebKit/535.16.1 (KHTML, like Gecko) Version/4.0.5 Mobile/8B114 Safari/6535.16.1'
  156. }
  157. resp = requests.get('https://www.baidu.com',proxies = proxies,headers = headers)
  158. print(resp.text)
  159. print("结束")