So, here is the code. Please critique it.
The program takes one argument: the required number of proxies. Once it has collected that many, it prints the list.
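Assuming the script is saved as, say, fpcheck.py (the file name is not fixed anywhere in the code), a run that collects ten proxies would look like:

./fpcheck.py 10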
#!/usr/bin/python
# -*- coding: utf-8 -*-
import fpclib
import time
import thread
import sys

if len(sys.argv) < 2:
    print "Usage: %s N" % (sys.argv[0])
    sys.exit()

N = int(sys.argv[1])
proxies = []
keywords = u"free proxy"
ports = [(3128, "http"), (1080, "socks4")]
TIMEOUT_GET = 10     # how many seconds to wait for a response from Google
TIMEOUT_GOOGLE = 2.0 # how long to wait after all sites (IPs) are processed, before the next Google query
TIMEOUT_CHECK = 15   # how long to wait for the proxy server to respond
TIMEOUT_WHILE = 0.2  # polling interval in the exit-wait loop (until all pending IPs are processed)
N_T = 50             # maximum number of checker threads at once
n_t = 0              # current number of checker threads
def func(proxy_hostname, proxy_port, proxy_type, timeout):
    global n_t
    global proxies
    if fpclib.check_proxy(proxy_hostname, proxy_port, proxy_type, timeout):
        s = "%s:%d" % (proxy_hostname, proxy_port)
        print "ACCEPT %s" % (s)
        if s not in proxies:
            proxies.append(s)
        print proxies
    n_t = n_t - 1
i = 0
while True:
    url = fpclib.geturl_google_text(keywords, i)
    html = fpclib.gethttp_pycurl(url, TIMEOUT_GET)
    sites = fpclib.cuturi_google(html)
    print sites
    for url_site in sites:
        html = fpclib.gethttp_pycurl(url_site, TIMEOUT_GET)
        ips = fpclib.cutip(html)
        print len(html), ips
        for ip in ips:
            for proxy_port, proxy_type in ports:
                while n_t > N_T:  # throttle: no more than N_T checker threads at once
                    time.sleep(TIMEOUT_WHILE)
                n_t = n_t + 1
                thread.start_new(func, (ip, proxy_port, proxy_type, TIMEOUT_CHECK,))
                if len(proxies) >= N: break
            if len(proxies) >= N: break
        if len(proxies) >= N: break
    if len(proxies) >= N: break
    i = i + 1
    time.sleep(TIMEOUT_GOOGLE)  # seconds

while n_t > 0:  # wait for the remaining checker threads to finish
    time.sleep(TIMEOUT_WHILE)
for proxy in proxies:
    print proxy
fpclib.py:
#!/usr/bin/python
# -*- coding: utf-8 -*-
from StringIO import StringIO
import pycurl
import re
import urllib
def gethttp_pycurl_f(url, f, timeout=1, proxy_type="none", proxy_hostname="localhost", proxy_port=3128):
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.WRITEDATA, f)
    curl.setopt(pycurl.FOLLOWLOCATION, 1)
    curl.setopt(pycurl.MAXREDIRS, 5)
    curl.setopt(pycurl.CONNECTTIMEOUT, timeout)
    curl.setopt(pycurl.TIMEOUT, timeout)
    curl.setopt(pycurl.NOSIGNAL, 1)
    if proxy_type == "http":
        curl.setopt(pycurl.PROXY, proxy_hostname)
        curl.setopt(pycurl.PROXYPORT, proxy_port)
        curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_HTTP)
    elif proxy_type == "socks4":
        curl.setopt(pycurl.PROXY, proxy_hostname)
        curl.setopt(pycurl.PROXYPORT, proxy_port)
        curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS4)
    elif proxy_type == "socks5":
        curl.setopt(pycurl.PROXY, proxy_hostname)
        curl.setopt(pycurl.PROXYPORT, proxy_port)
        curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
    try:
        curl.perform()
    except Exception, e:
        print "Exception: %s" % str(e)
    curl.close()
def gethttp_pycurl(url, timeout=1, proxy_type="none", proxy_hostname="localhost", proxy_port=3128):
    body = StringIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.WRITEFUNCTION, body.write)
    curl.setopt(pycurl.FOLLOWLOCATION, 1)
    curl.setopt(pycurl.MAXREDIRS, 5)
    curl.setopt(pycurl.CONNECTTIMEOUT, timeout)
    curl.setopt(pycurl.TIMEOUT, timeout)
    curl.setopt(pycurl.NOSIGNAL, 1)
    if proxy_type == "http":
        curl.setopt(pycurl.PROXY, proxy_hostname)
        curl.setopt(pycurl.PROXYPORT, proxy_port)
        curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_HTTP)
    elif proxy_type == "socks4":
        curl.setopt(pycurl.PROXY, proxy_hostname)
        curl.setopt(pycurl.PROXYPORT, proxy_port)
        curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS4)
    elif proxy_type == "socks5":
        curl.setopt(pycurl.PROXY, proxy_hostname)
        curl.setopt(pycurl.PROXYPORT, proxy_port)
        curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
    try:
        curl.perform()
    except Exception, e:
        print "Exception: %s" % str(e)
    page = body.getvalue()
    curl.close()
    return page
def geturl_rambler_text(text, i):
    site = "http://www.rambler.ru/lite"
    url = urllib.urlencode([
        ("oe", "1251"),
        ("words", text.encode("cp1251")),
        ("start", str(int(i)*10+1))])
    return site + "?" + url

def geturl_rambler_images(text, i):
    site = "http://nova.rambler.ru/pictures"
    url = urllib.urlencode([
        ("query", text.encode("utf-8"))])
    return site + "?" + url

def geturl_yandex_text(text, i):
    site = "http://yandex.ru/yandsearch"
    url = urllib.urlencode([
        ("rpt", "rad"),
        ("text", text.encode("utf-8")),
        ("p", str(int(i)))])
    return site + "?" + url

def geturl_yandex_images(text, i):
    site = "http://images.yandex.ru/yandsearch"
    url = urllib.urlencode([
        ("stype", "image"),
        ("text", text.encode("utf-8"))])  # utf-8
    return site + "?" + url

def geturl_google_text(text, i):
    site = "http://www.google.ru/search"
    url = urllib.urlencode([
        ("hl", "ru"),
        ("q", text.encode("utf-8")),
        #("q", text),
        ("start", str(int(i)*10))])
    return site + "?" + url

def geturl_google_images(text, i):
    site = "http://images.google.ru/images"
    url = urllib.urlencode([
        ("hl", "ru"),
        ("gbv", "2"),
        ("btnG", "%D0%9F%D0%BE%D0%B8%D1%81%D0%BA+%D0%BA%D0%B0%D1%80%D1%82%D0%B8%D0%BD%D0%BE%D0%BA"),
        ("q", text.encode("utf-8"))])
    return site + "?" + url
def cuturi(text):
    a = re.findall("((?:http://|https://|ftp://|gopher://|mailto:|xmpp:)(?:[\w\.]+:\d+)?(?:[^\"\'\t\n\r< >:]+))", text)
    return a

def cuturi_google(text):
    a = re.findall("((?:http://|https://|ftp://|gopher://|mailto:|xmpp:)(?:[\w\.]+:\d+)?(?:[^\"\'\t\n\r< >:]+\" target))", text)
    text = ' '.join(a)
    return cuturi(text)

def cutip(text):
    a = re.findall("(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})", text)
    return a
def check_proxy(proxy_hostname, proxy_port, proxy_type="http", timeout=30):
    url = "http://ya.ru/"
    html = gethttp_pycurl(url, timeout, proxy_type, proxy_hostname, proxy_port)
    # the proxy is considered working if the page fetched through it looks like the real ya.ru
    if re.search("http://yandex.ru/", html) is not None:
        res = True
    else:
        res = False
    return res
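For reference, fpclib can also be used on its own. A minimal standalone check of a single proxy (the address and port below are just placeholders) would be:

import fpclib

# placeholder address/port; 15-second timeout, as in the main script
if fpclib.check_proxy("10.0.0.1", 3128, "http", 15):
    print "alive"
else:
    print "dead"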
Project page: http://php.kirovnet.ru/fpcheck.html