Search for proxies via Python

#!/usr/bin/env python
"""Get proxies from urls, and test their speed"""
import urllib, re, time, threading
# Pages that publish proxy lists; each one is scraped for "ip:port" strings.
urls = [
    "http://www.hitchina.net/taxonomy/term/14",
    "http://www.hitchina.net/taxonomy/term/15",
    "http://www.ipbbs.com/",
    "http://www.pass-e.com/proxy/",
    "http://www.haozs.net/ip.htm",
]

# Proxy mapping passed to urllib when fetching the pages listed in `urls`.
urls_proxy = {}

# A dotted-quad address, a colon, then one or more port digits.
proxy_pattern = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,}")

# Page fetched through every candidate proxy, and the text that has to show
# up in the response for the proxy to count as working.
test_url = "http://setiathome.berkeley.edu/"
test_pattern = re.compile(r"SETI@home")

# Longest time, in seconds, to wait for a single proxy test.
time_out = 30.0

# Known proxies are loaded from this file and the merged list is saved back.
output_file = "Proxies.txt"

# PAC generation: every match of `pac_pattern` in `template` is replaced by
# the selected proxy and the result is written to `pac_file`.
template = "UserdefinedTemplate.pac"
pac_pattern = re.compile(r"MyProxy")
pac_file = "proxy.pac"

# Preferred proxies, chosen ahead of the fastest one when found among the
# working candidates.
# NOTE(review): these entries carry an "http://" prefix, whereas candidates
# produced by `proxy_pattern` are bare "ip:port" strings.
advProxy = ["http://219.217.250.3:3128"]
 
class TestTime(threading.Thread):
    """Measure how long one proxy takes to fetch ``test_url``.

    The fetch runs in its own thread so that a caller can stop waiting
    after a timeout. The outcome is exposed through two attributes:

    time -- seconds taken by a fetch whose body matched ``test_pattern``;
            stays None for every other outcome.
    stat -- status line; starts as "<proxy> time out!" and is replaced
            once the fetch completes or fails.
    """

    def __init__(self, proxy):
        """Prepare a test for ``proxy``, given as a "host:port" string."""
        threading.Thread.__init__(self)
        self.proxy = proxy
        self.time = None
        self.stat = proxy + " time out!"

    def run(self):
        """Fetch ``test_url`` through the proxy and record the outcome."""
        start = time.time()
        try:
            f = urllib.urlopen(test_url,
                               proxies={"http": "http://" + self.proxy})
            try:
                data = f.read()
            finally:
                # Release the connection even when the read breaks midway.
                f.close()
        except Exception:
            # Opening and reading share one handler: a proxy that dies
            # while sending the body is reported as failed instead of
            # killing the thread and leaving the "time out!" status behind.
            self.stat = self.proxy + " fails!"
            return
        end = time.time()

        if test_pattern.search(data):
            self.time = end - start
            self.stat = self.proxy + " time: " + str(self.time)
        else:
            # A response arrived but it is not the expected page.
            self.stat = self.proxy + " not matched!"
 
def totest(proxy, result):
    """Time a single proxy, giving up after ``time_out`` seconds.

    proxy  -- "host:port" string handed to TestTime.
    result -- shared list; an ``(elapsed_seconds, proxy)`` tuple is appended
              when the test recorded a time before the wait ended.
    """
    worker = TestTime(proxy)
    # Daemon thread: a proxy that never answers must not block interpreter
    # exit.
    worker.setDaemon(True)
    worker.start()
    # Wait at most time_out seconds, then look at whatever was recorded.
    worker.join(time_out)
    elapsed = worker.time
    if not elapsed:
        return
    result.append((elapsed, proxy))
 
def _load_saved_proxies(path):
    """Return the set of proxies stored in ``path``; empty if unreadable."""
    try:
        f = open(path)
    except IOError:
        return set()
    saved = set()
    try:
        for line in f:
            # strip() instead of chopping the last character: it copes with
            # "\r\n" endings and with a final line that has no newline.
            entry = line.strip()
            if entry:  # a blank line is not a proxy worth testing
                saved.add(entry)
    finally:
        f.close()
    return saved


def _harvest_proxies(source_urls, found):
    """Scrape each page in ``source_urls`` and add its proxies to ``found``."""
    for url in source_urls:
        print("getting proxy from " + url)
        try:
            f = urllib.urlopen(url, proxies=urls_proxy)
            try:
                data = f.read()
            finally:
                f.close()
        except Exception:
            # Listing sites come and go; report the failure and carry on
            # with the remaining sources.
            print(url + " can not open!")
            continue
        found.update(proxy_pattern.findall(data))
        print(url + " finished!")


def _rank_proxies(proxies):
    """Test every proxy in parallel; return sorted (seconds, proxy) tuples."""
    timings = []
    workers = []
    for proxy in proxies:
        t = threading.Thread(target=totest, args=(proxy, timings))
        t.setDaemon(True)
        t.start()
        workers.append(t)
    # Join the workers rather than sleeping for a fixed period: the upper
    # bound stays time_out + 5 seconds, but the script moves on as soon as
    # every test has finished (immediately when there is nothing to test).
    deadline = time.time() + time_out + 5.0
    for t in workers:
        t.join(max(0.0, deadline - time.time()))
    # Sort a copy so a straggler appending late cannot disturb the sort.
    return sorted(timings)


def _choose_proxy(ranked, preferred):
    """Return the proxy to publish as "host:port", or "" if none worked.

    A preferred proxy wins when it is among the working ones (the last
    such entry in ``preferred`` takes precedence); otherwise the fastest
    entry of ``ranked`` is used.
    """
    working = set(proxy for _, proxy in ranked)
    chosen = ""
    for adv in preferred:
        # Preferred entries may be written with a scheme, but candidates
        # are bare "host:port", so compare and return the bare form.
        bare = adv
        if bare.startswith("http://"):
            bare = bare[len("http://"):]
        if bare in working:
            chosen = bare
    if not chosen and ranked:
        chosen = ranked[0][1]
    return chosen


def _save_proxies(path, proxies):
    """Write one proxy per line to ``path``."""
    try:
        f = open(path, "w")
    except IOError:
        print("Can not open output file!")
        return
    try:
        f.writelines(proxy + "\n" for proxy in sorted(proxies))
    finally:
        f.close()


def _write_pac(template_path, pac_path, proxy):
    """Fill the PAC template's placeholder with ``proxy`` and save it."""
    try:
        f = open(template_path, "r")
    except IOError:
        print("Can not open template file!")
        return
    try:
        data = f.read()
    finally:
        f.close()
    data = pac_pattern.sub(proxy, data)
    try:
        f = open(pac_path, "w")
    except IOError:
        print("Can not open pac file!")
        return
    try:
        f.write(data)
    finally:
        f.close()


if __name__ == "__main__":
    # Start from the proxies remembered by earlier runs, then add whatever
    # the listing pages currently advertise.
    allproxies = _load_saved_proxies(output_file)
    _harvest_proxies(urls, allproxies)

    # Time every candidate and pick the one to publish.
    result = _rank_proxies(allproxies)
    bestProxy = _choose_proxy(result, advProxy)

    # Every known proxy is kept, working or not, for the next run.
    _save_proxies(output_file, allproxies)

    if bestProxy:
        _write_pac(template, pac_file, bestProxy)
    else:
        # Substituting an empty proxy would produce a broken PAC file.
        print("No working proxy found; " + pac_file + " not written.")

相关日志


You can follow any responses to this entry through the RSS 2.0 feed. You can leave a response, or trackback from your own site.

Leave a Reply