Browse source

Scrape proxy IPs from the Xici (xicidaili) site

deng 5 years ago
parent
commit
84b0476ae1
1 changed file with 262 additions and 0 deletions

+ 262 - 0
xici.txt

@@ -0,0 +1,262 @@
+import requests
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
+from urllib.parse import quote
+import time
+import json
+import random
+
+def getlnglat(address):
+    """Look up the latitude/longitude of a place name via the Baidu geocoding API."""
+    url = 'http://api.map.baidu.com/geocoder/v2/'
+    output = 'json'  # the API can also return other formats, e.g. xml
+    ak = 'hnhLTrxaZPI7jFkIkqA1TuLUWWmHV7Q1'
+    add = quote(str(address))  # quote() can raise KeyError, so convert to str first
+    # build the Baidu Maps API query URI (on URI vs. URL see
+    # https://www.zhihu.com/question/21950864)
+    uri = url + '?' + 'address=' + add + '&output=' + output + '&ak=' + ak
+
+    # Too many requests in a row can raise
+    # urllib.error.URLError: <urlopen error [Errno 10060]>,
+    # so on failure sleep 10 seconds and retry; timeout=30 guards
+    # against network delays and temporary blocking by Baidu.
+    maxNum = 5
+    req = None
+    for tries in range(maxNum):
+        try:
+            req = urlopen(uri, timeout=30)
+            break  # success, stop retrying
+        except Exception:
+            if tries < (maxNum - 1):
+                time.sleep(10)
+                continue
+            else:
+                print("Has tried %d times, all failed!" % maxNum)
+    if req is None:
+        return None, None
+
+    res = req.read().decode()
+    print(res)
+    temp = json.loads(res)
+    lat = temp['result']['location']['lat']
+    lng = temp['result']['location']['lng']
+    return lat, lng
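+# Usage sketch for getlnglat (illustrative: needs a valid Baidu Maps ak and
+# network access; '武汉市' is just an example address):
+#   lat, lng = getlnglat('武汉市')
+#   print(lat, lng)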
+def getagent():
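+    # NOTE: getagent() is redefined twice below; Python keeps the last definition.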
+    list1 = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
+             'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
+             'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
+             'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
+             'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
+             'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)']
+    return list1[random.randint(0, 5)]
+def getheaders():
+    # anjuke.com rejects obvious bots, so browser-like headers are required
+    headers = {
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': 'zh-CN,zh;q=0.8',
+        'Referer': 'https://wuhan.anjuke.com/sale/?from=navigation',
+        'User-Agent': getagent()
+    }
+    return headers
+def getagent():
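+    # NOTE: redefined again near the end of the file; that later copy is the one actually used.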
+    list1 = [
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
+        'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
+        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
+        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)']
+    a = random.randint(0, 9)
+    print(a)
+    return list1[a]
+def getip():
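+    # NOTE: shadowed by the larger getip() below and the file-based one near the end; the last definition wins.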
+    ip = ['1.197.203.148:9999',
+          '163.204.246.188:9999',
+          '163.204.246.80:9999',
+          '163.204.242.23:9999',
+          '182.34.36.23:25995',
+          '163.204.242.240:9999',
+          '112.85.164.83:9999',
+          '163.204.246.253:9999',
+          '120.83.110.1:9999',
+          '182.35.84.183:9999',
+          '182.35.81.201:9999',
+          '182.35.86.166:9999',
+          '182.35.84.154:9999',
+          '163.204.240.251:9999',
+          '163.204.243.205:9999',
+          '163.204.247.2:9999',
+          '36.249.118.58:9999',
+          '163.204.241.167:9999',
+          '163.204.246.169:9999']
+    b = random.randint(0, 18)
+    print(b)
+    return ip[b]
+def gethtml():
+    # url = 'http://www.xicidaili.com/nn/'
+    # html = requests.get(url=url, headers=getheaders()).text
+    # soup = BeautifulSoup(html, 'html.parser')
+    # ips = soup.find_all(id='ip_list').find_all('tr')
+    # ip_list = []
+    # for i in range(1, len(ips)):
+    #     ip_info = ips[i]
+    #     tds = ip_info.find_all('td')
+    #     ip_list.append(tds[1].text + ':' + tds[2].text)
+    # # pick one ip at random from the list
+    # ip = random.choice(ip_list)
+    proxies = getip()  # the last getip() definition below returns a proxies dict
+    response = requests.get('https://www.anjuke.com/sy-city.html', proxies=proxies, timeout=10)
+    print(response.text)
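+# Note: requests selects a proxy by URL scheme, so a request to an https:// URL
+# only goes through the proxy if the dict also has an 'https' key, e.g.
+#   proxies = {'http': 'http://1.2.3.4:9999', 'https': 'http://1.2.3.4:9999'}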
+def getip():
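+    # NOTE: shadowed by the file-based getip() defined near the end of the file.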
+    ip = ['163.204.241.158:9999',
+          '182.35.87.174:9999',
+          '182.35.87.175:9999',
+          '182.35.87.176:9999',
+          '182.35.87.177:9999',
+          '123.163.96.131:9999',
+          '1.197.16.130:9999',
+          '182.35.85.157:9999',
+          '163.204.244.212:9999',
+          '163.204.243.252:9999',
+          '117.91.132.170:9999',
+          '58.253.156.1:9999',
+          '1.197.16.129:9999',
+          '163.204.246.126:9999',
+          '60.13.42.67:9999',
+          '49.86.182.174:9999',
+          '182.35.80.229:9999',
+          '58.253.156.225:9999',
+          '163.204.240.126:9999',
+          '60.13.42.70:9999',
+          '123.163.122.108:9999',
+          '175.42.122.46:9999',
+          '123.163.97.234:9999',
+          '182.35.82.130:9999',
+          '163.204.245.72:9999',
+          '60.13.42.127:9999',
+          '121.233.206.73:9999',
+          '163.204.246.149:9999',
+          '182.35.83.33:9999',
+          '163.204.243.16:9999',
+          '163.204.246.174:9999',
+          '121.233.251.60:9999',
+          '123.169.37.250:38677',
+          '120.83.109.29:9999',
+          '163.204.241.164:9999',
+          '163.204.246.42:9999',
+          '163.204.241.228:9999',
+          '175.42.68.4:9999',
+          '182.35.84.155:9999',
+          '112.85.130.205:9999',
+          '122.193.247.115:9999',
+          '163.204.241.190:9999',
+          '163.204.245.237:9999',
+          '163.204.242.245:9999',
+          '115.53.19.82:9999',
+          '112.85.128.146:9999',
+          '163.204.244.40:9999',
+          '182.35.80.5:9999',
+          '163.204.242.130:9999',
+          '112.85.129.88:9999',
+          '113.121.20.143:9999',
+          '182.35.83.136:9999',
+          '182.35.80.195:9999',
+          '120.83.105.248:9999',
+          '112.85.151.152:9999',
+          '171.11.178.94:9999',
+          '171.12.113.6:9999',
+          '112.85.165.194:9999',
+          '123.163.122.254:9999',
+          '58.253.158.174:9999',
+          '120.84.101.164:9999',
+          '60.13.42.61:9999',
+          '60.13.42.207:9999',
+          '1.198.72.219:9999',
+          '182.35.80.54:9999',
+          '114.230.69.232:9999',
+          '163.204.242.126:9999',
+          '58.253.154.253:9999',
+          '180.108.218.242:9999',
+          '112.85.149.238:9999',
+          '114.230.69.109:9999',
+          '60.13.42.28:9999',
+          '163.204.244.39:9999',
+          '180.108.218.179:9999',
+          '121.233.251.82:9999',
+          '113.121.20.252:808',
+          '120.83.111.43:9999',
+          '182.35.86.234:9999',
+          '182.35.83.200:9999',
+          '60.13.42.142:9999',
+          '120.83.98.106:9999',
+          '117.91.130.10:9999',
+          '111.226.188.146:8010',
+          '180.119.68.222:9999',
+          '123.163.96.170:9999',
+          '60.13.42.57:9999',
+          '113.121.23.248:9999',
+          '222.189.144.147:9999',
+          '60.13.42.172:9999',
+          '183.128.167.248:8118',
+          '182.35.86.217:9999',
+          '60.13.42.38:9999',
+          '222.89.32.141:8070',
+          '183.157.84.221:8118',
+          '222.189.191.34:9999',
+          '123.163.122.129:9999',
+          '121.233.227.214:9999',
+          '180.119.141.163:9999']
+    b = random.randint(0, 97)
+    print("ip:" + str(b))
+    return {'http': 'http://' + ip[b]}
+def getagent():
+    list1 = [
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
+        'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
+        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
+        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)']
+    a = random.randint(0, 9)
+    print("agent:" + str(a))
+    return list1[a]
+def get_ip_list(num):
+    print(num)
+    url = 'http://www.xicidaili.com/nn/%d' % num
+    print(url)
+    headers = {
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': 'zh-CN,zh;q=0.8',
+        'User-Agent': getagent()
+    }
+    web_data = requests.get(url, headers=headers, timeout=10)
+    soup = BeautifulSoup(web_data.text, 'html.parser')
+    ips = soup.find_all('tr')
+    ip_list = []
+    # note: getip() below reads "西刺ip.txt", while this writes "西刺ip1.txt"
+    with open("西刺ip1.txt", 'a', encoding="utf-8", buffering=1) as file:
+        for i in range(1, len(ips)):
+            ip_info = ips[i]
+            tds = ip_info.find_all('td')
+            strip = tds[1].text + ':' + tds[2].text
+            ip_list.append(strip)
+            print(strip + ",")
+            file.write('\'' + strip + '\'' + "," + '\n')
+    return ip_list
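+# A minimal validation sketch (an addition, not part of the original script):
+# free proxies scraped from xicidaili are frequently dead, so it can help to
+# test each one before use. check_proxy() and the httpbin.org probe URL are
+# illustrative assumptions.
+def check_proxy(proxy, timeout=5):
+    """Return True if 'host:port' answers a simple HTTP request via requests."""
+    try:
+        r = requests.get('http://httpbin.org/ip',
+                         proxies={'http': 'http://' + proxy},
+                         timeout=timeout)
+        return r.status_code == 200
+    except requests.RequestException:
+        return False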
+def getip():
+    # the proxy file is only read here, so open it in 'r' mode
+    with open("西刺ip.txt", 'r', encoding='utf-8') as file:
+        # entries may be stored quoted with trailing commas (see get_ip_list)
+        ip = [line.strip().strip(",").strip("'") for line in file if line.strip()]
+    b = random.randrange(len(ip))  # avoids hard-coding the list length
+    print("ip:" + str(b))
+    return {'http': 'http://' + ip[b]}
+if __name__ == '__main__':
+    for a in range(1,300):
+        get_ip_list(a)
+    # print(getip())
+    # print('<address>')