import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import quote
import time
import json
import random

def getlnglat(address):
    """Look up the latitude and longitude of a place name via the Baidu Maps geocoding API."""
    url = 'http://api.map.baidu.com/geocoder/v2/'
    output = 'json'  # response format; the API also supports other types
    ak = 'hnhLTrxaZPI7jFkIkqA1TuLUWWmHV7Q1'
    add = quote(str(address))  # quote() can raise KeyError on non-string input, so cast to str first
    # Build the query URI for the Baidu Maps API (on URI vs. URL see https://www.zhihu.com/question/21950864)
    uri = url + '?' + 'address=' + add + '&output=' + output + '&ak=' + ak
    # The retry loop guards against errors from sending too many requests
    # (urllib2.URLError: <urlopen error [Errno 10060]>): on failure, sleep 10
    # seconds and retry. timeout=30 guards against delays from the network itself.
    maxNum = 5
    for tries in range(maxNum):
        try:
            req = urlopen(uri, timeout=30)  # 30-second timeout in case Baidu throttles the request
            break
        except Exception:
            if tries < (maxNum - 1):
                time.sleep(10)
                continue
            else:
                print("Has tried %d times, all failed!" % maxNum)
                return None
    res = req.read().decode()
    print(res)
    temp = json.loads(res)
    lat = temp['result']['location']['lat']
    lng = temp['result']['location']['lng']
    return lat, lng

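# Example call (hypothetical address; the ak above must be a valid Baidu Maps API key):
# coords = getlnglat('武汉市洪山区')
# if coords:
#     lat, lng = coords
#     print(lat, lng)
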
def getagent():
    """Return a random User-Agent string so repeated requests look less uniform."""
    list1 = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)']
    a = random.randint(0, len(list1) - 1)
    print("agent:" + str(a))
    return list1[a]

def getheaders():
    headers = {  # Anjuke blocks bare scrapers, so request headers are required
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Referer': 'https://wuhan.anjuke.com/sale/?from=navigation',
        'User-Agent': getagent()
    }
    return headers

def gethtml():
    """Fetch the Anjuke city index page through a random proxy and print the HTML."""
    # Earlier, commented-out approach: scrape a fresh proxy from xicidaili.com on the fly.
    # url = 'http://www.xicidaili.com/nn/'
    # html = requests.get(url=url, headers=getheaders()).text
    # soup = BeautifulSoup(html, 'html.parser')
    # ips = soup.find(id='ip_list').find_all('tr')
    # ip_list = []
    # for i in range(1, len(ips)):
    #     ip_info = ips[i]
    #     tds = ip_info.find_all('td')
    #     ip_list.append(tds[1].text + ':' + tds[2].text)
    # # pick a random ip from the list
    # ip = random.choice(ip_list)
    proxies = getip()  # getip() returns a proxies dict such as {'http': 'http://1.2.3.4:9999'}
    response = requests.get('https://www.anjuke.com/sy-city.html', proxies=proxies, timeout=10)
    print(response.text)

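# Helper sketch, not part of the original script: free proxies like the ones below go
# stale quickly, so it can help to verify one before use. http://httpbin.org/ip simply
# echoes the caller's IP address.
def check_proxy(proxies):
    """Return True if the given proxies dict can fetch a test URL within 5 seconds."""
    try:
        return requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5).ok
    except requests.RequestException:
        return False
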
def getip():
    """Return a random proxies dict from a static list of free (and likely short-lived) HTTP proxies."""
    ip = ['163.204.241.158:9999', '182.35.87.174:9999', '182.35.87.175:9999', '182.35.87.176:9999',
          '182.35.87.177:9999', '123.163.96.131:9999', '1.197.16.130:9999', '182.35.85.157:9999',
          '163.204.244.212:9999', '163.204.243.252:9999', '117.91.132.170:9999', '58.253.156.1:9999',
          '1.197.16.129:9999', '163.204.246.126:9999', '60.13.42.67:9999', '49.86.182.174:9999',
          '182.35.80.229:9999', '58.253.156.225:9999', '163.204.240.126:9999', '60.13.42.70:9999',
          '123.163.122.108:9999', '175.42.122.46:9999', '123.163.97.234:9999', '182.35.82.130:9999',
          '163.204.245.72:9999', '60.13.42.127:9999', '121.233.206.73:9999', '163.204.246.149:9999',
          '182.35.83.33:9999', '163.204.243.16:9999', '163.204.246.174:9999', '121.233.251.60:9999',
          '123.169.37.250:38677', '120.83.109.29:9999', '163.204.241.164:9999', '163.204.246.42:9999',
          '163.204.241.228:9999', '175.42.68.4:9999', '182.35.84.155:9999', '112.85.130.205:9999',
          '122.193.247.115:9999', '163.204.241.190:9999', '163.204.245.237:9999', '163.204.242.245:9999',
          '115.53.19.82:9999', '112.85.128.146:9999', '163.204.244.40:9999', '182.35.80.5:9999',
          '163.204.242.130:9999', '112.85.129.88:9999', '113.121.20.143:9999', '182.35.83.136:9999',
          '182.35.80.195:9999', '120.83.105.248:9999', '112.85.151.152:9999', '171.11.178.94:9999',
          '171.12.113.6:9999', '112.85.165.194:9999', '123.163.122.254:9999', '58.253.158.174:9999',
          '120.84.101.164:9999', '60.13.42.61:9999', '60.13.42.207:9999', '1.198.72.219:9999',
          '182.35.80.54:9999', '114.230.69.232:9999', '163.204.242.126:9999', '58.253.154.253:9999',
          '180.108.218.242:9999', '112.85.149.238:9999', '114.230.69.109:9999', '60.13.42.28:9999',
          '163.204.244.39:9999', '180.108.218.179:9999', '121.233.251.82:9999', '113.121.20.252:808',
          '120.83.111.43:9999', '182.35.86.234:9999', '182.35.83.200:9999', '60.13.42.142:9999',
          '120.83.98.106:9999', '117.91.130.10:9999', '111.226.188.146:8010', '180.119.68.222:9999',
          '123.163.96.170:9999', '60.13.42.57:9999', '113.121.23.248:9999', '222.189.144.147:9999',
          '60.13.42.172:9999', '183.128.167.248:8118', '182.35.86.217:9999', '60.13.42.38:9999',
          '222.89.32.141:8070', '183.157.84.221:8118', '222.189.191.34:9999', '123.163.122.129:9999',
          '121.233.227.214:9999', '180.119.141.163:9999']
    b = random.randint(0, len(ip) - 1)
    print("ip:" + str(b))
    return {'http': 'http://' + ip[b]}

def get_ip_list(num):
    """Scrape page `num` of xicidaili.com's proxy list and append ip:port entries to 西刺ip1.txt."""
    print(num)
    url = 'http://www.xicidaili.com/nn/%d' % num
    print(url)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': getagent()
    }
    web_data = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(web_data.text, 'html.parser')
    ips = soup.find_all('tr')
    ip_list = []
    with open("西刺ip1.txt", 'a', encoding="utf-8", buffering=1) as file:
        for i in range(1, len(ips)):
            ip_info = ips[i]
            tds = ip_info.find_all('td')
            strip = tds[1].text + ':' + tds[2].text  # ip:port from the table row
            ip_list.append(strip)
            print(strip + ",")
            file.write("'" + strip + "'," + "\n")

def getip_from_file():
    """Return a random proxies dict drawn from proxies previously saved to 西刺ip.txt."""
    with open("西刺ip.txt", 'r', encoding='utf-8') as file:  # 'r': the file is read here, not appended to
        # tolerate lines in the format written by get_ip_list(), e.g. '1.2.3.4:9999',
        ip = [line.strip().strip("',") for line in file if line.strip()]
    b = random.randint(0, len(ip) - 1)
    print("ip:" + str(b))
    return {'http': 'http://' + ip[b]}

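# Usage sketch (hypothetical; assumes 西刺ip.txt already contains scraped proxies):
# print(requests.get('https://www.anjuke.com/sy-city.html',
#                    proxies=getip_from_file(), timeout=10).status_code)
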
if __name__ == '__main__':
    for a in range(1, 300):  # scrape proxy-list pages 1-299
        get_ip_list(a)
    # print(getip())
    # print('<address>')