Поиск по блогу

среда, 9 апреля 2014 г.

Пробуем PyProxy - комбайн 2010 года для сбора и проверки адресов

Простой модуль Python. Надеюсь, что он работает. В любом случае, начинать надо с простых примеров. Скачать можно PyProxy здесь. ...pyproxy is a Proxy hunter and Tester a high-level cross-protocol proxy-hunter python library.

Эти команды проверки и распаковки (разъархивации) пакетов tar.gz надо запомнить

In [1]:
# Check the package before unpacking
!tar -zvtf /home/kiss/Desktop/Temp/pyproxy-v.09.tar.gz
-rwxr-xr-x gunslinger/gunslinger 13170 2010-10-08 18:21 pyproxy.py


In [2]:
# Unpack in 'ipython notebook' folder
!tar -zxvf /home/kiss/Desktop/Temp/pyproxy-v.09.tar.gz
pyproxy.py


In [3]:
# import into the next cell
%load pyproxy.py
In [3]:
#!/usr/bin/env python
#   This library is free software; you can redistribute it and/or
#   modify it under the terms of the GNU Lesser General Public
#   License as published by the Free Software Foundation; either
#   version 2.1 of the License, or (at your option) any later version.
#
#   This library is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#   Lesser General Public License for more details.
#
#   You should have received a copy of the GNU Lesser General Public
#   License along with this library; if not, write to the 
#      Free Software Foundation, Inc., 
#      59 Temple Place, Suite 330, 
#      Boston, MA  02111-1307  USA
#
#   Copyright 2010 Gunslinger_ <yudha.gunslinger@gmail.com>
#   http://bit.ly/c0debreaker

import sys
import warnings
import urllib2
import re
import socket
import random
import optparse
import os
warnings.filterwarnings(action="ignore", message=".*(sets) module is deprecated", category=DeprecationWarning)
import sets

# Module metadata; __version__ and __author__ are read by main()'s banner
# and by runengine.printversion() (the -v/--version option).
__author__ = "Gunslinger_ <yudha.gunslinger@gmail.com>"
__date__  = "Thu Oct  7 00:00:41 2010"
__version__  = "09"
__copyright__  = "Copyright (c) 2010 Gunslinger_"

class proxyhunter(object):
 """Scrape HTTP proxies from public list sites and test them.

 Harvested "ip:port" strings are appended to OutputProxy; proxies that
 successfully fetch http://www.google.com are recorded in GoodProxy.

 Constructor keywords:

 OutputProxy
  Output file every harvested proxy will be appended to.
  Default : proxylist.txt

 GoodProxy
  Output file all good (working) proxies will be written to.
  Default : goodproxylist.txt

 Verbose
  More noise: every proxy (and test response) is printed to the screen.
  Default : True

 TimeOut
  Socket timeout in seconds for each proxy test connection.
  Default : 30

 Sitelist
  List of proxy-list page URLs to scrape (used by Single()).
  Default : []
 """
 # NOTE(review): Sitelist=[] is a shared mutable default argument; every
 # caller in this file passes an explicit list, but confirm before reuse.
 def __init__(self, OutputProxy='proxylist.txt', GoodProxy='goodproxylist.txt', Verbose=True, TimeOut=30, Sitelist=[]):
  # ANSI escape codes used to colourise counters in terminal output.
  self._red   = '\033[31m'
  self._reset   = '\033[0;0m'
  # Trailing padding so "\r" rewrites fully overwrite the previous line.
  self._wide  = " "*50
  self._timeout  = TimeOut
  self._verbose  = Verbose
  # URL fetched through each candidate proxy to decide good/bad.
  self._testurl  = 'http://www.google.com'
  # User-agent strings picked at random per request to vary the headers.
  self._ouruseragent  = ['Mozilla/4.0 (compatible; MSIE 5.0; SunOS 5.10 sun4u; X11)',
     'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.2pre) Gecko/20100207 Ubuntu/9.04 (jaunty) Namoroka/3.6.2pre',
     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser;',
     'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0)',
            'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1)',
            'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6)',
            'Microsoft Internet Explorer/4.0b1 (Windows 95)',
            'Opera/8.00 (Windows NT 5.1; U; en)',
     'amaya/9.51 libwww/5.4.0',
     'Mozilla/4.0 (compatible; MSIE 5.0; AOL 4.0; Windows 95; c_athome)',
     'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
     'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
     'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; ZoomSpider.net bot; .NET CLR 1.1.4322)',
     'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; QihooBot 1.0 qihoobot@qihoo.net)',
     'Mozilla/4.0 (compatible; MSIE 5.0; Windows ME) Opera 5.11 [en]']
  # Referer header is also randomised per request.
  self._referer  = ['http://google.com','http://bing.com']
  # You can add yours...   
  self._sitelist   = Sitelist 
  self._output  = OutputProxy
  self._goodproxy  = GoodProxy     
    
 # Scrape pages proxy-01.htm .. proxy-60.htm from www.samair.ru and append
 # every "ip:port" match to self._output, updating a one-line counter.
 def Samairdotru(self): 
  counter  = 1 
  proxycounter = 0 
  maxpages  = 60 
  urls   = [] 
  cntlen  = 0
  proxyfile = file(self._output, 'a') 
  print "[*] Hunting proxy from samair.ru please wait..." 
  while counter <= maxpages: 
   # Pages 1-9 are zero-padded in the URL (proxy-01.htm), hence the branch.
   if counter <= 9: 
    opener = urllib2.build_opener(urllib2.HTTPHandler)
    opener.addheaders = [('User-agent', random.choice(self._ouruseragent)),
       ('Referer', random.choice(self._referer))]  
    urllib2.install_opener(opener)
    url = urllib2.urlopen('http://www.samair.ru/proxy/proxy-0'+repr(counter)+'.htm').read() 
   else: 
    opener = urllib2.build_opener(urllib2.HTTPHandler)
    opener.addheaders = [('User-agent', random.choice(self._ouruseragent)),
       ('Referer', random.choice(self._referer))]  
    urllib2.install_opener(opener)
    url = urllib2.urlopen('http://www.samair.ru/proxy/proxy-'+repr(counter)+'.htm').read() 
   # NOTE(review): the dots here are unescaped, so '.' matches any
   # character; ParseProxy below uses the escaped form [.] instead.
   proxies = re.findall(('\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}:\d{1,5}'), url) 
   lenstr = len(proxies)
   proxycounter = int(proxycounter) + int(len(proxies))
   # "\r" + self._wide rewrites the same terminal line with a running total.
   sys.stdout.write("\r[*] %s%d%s Proxies received from : http://www.samair.ru/proxy/ %s" % (self._red, int(proxycounter), self._reset, self._wide))
   sys.stdout.flush()
   for singleproxy in proxies:
    if self._verbose:
     print singleproxy
    proxyfile.write(singleproxy+"\n")   
   counter = counter+1 
   opener.close()
  print "\n" 
  proxyfile.close()    
   
 # Fetch a single proxy-list page and append every "ip:port" match to
 # self._output, then report how many were found.
 # NOTE(review): the indentation below is a tab/space mix mangled by the
 # blog transcription; preserved byte-for-byte here.
 def ParseProxy(self, site): 
  print "[*] Parse proxy from %s" % (site.split("//",3)[1])
         proxycounter  = 0 
         urls   = [] 
         proxyfile  = file(self._output, 'a') 
  opener   = urllib2.build_opener(urllib2.HTTPHandler)
  opener.addheaders = [('User-agent', random.choice(self._ouruseragent)),
     ('Referer', random.choice(self._referer))]  
  urllib2.install_opener(opener)
         url   = urllib2.urlopen(site).read() 
         proxies  = re.findall(('\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[:]\d{1,5}'), url) 
         for singleproxy in proxies: 
          if self._verbose:
           print singleproxy
   proxyfile.write(singleproxy+"\n") 
                 proxycounter = proxycounter+1 
  sys.stdout.write("[*] %s%d%s Proxies receieved from : %s %s\n" % (self._red, int(proxycounter), self._reset, site.split("//",3)[1], self._wide))
  sys.stdout.flush()  
         opener.close() 
         proxyfile.close() 
   
 # Run ParseProxy over every URL supplied in the constructor's Sitelist.
 def Single(self):
  for site in self._sitelist:
   self.ParseProxy(site)
           
 # De-duplicate the harvested proxy file into sorted_output. Despite the
 # name, entries are not sorted — first-seen order is preserved.
 # NOTE(review): the "proxy not in sortproxy" list scan is O(n^2) for
 # large harvests; fine for a few hundred proxies.
 def Cleanitup(self, sorted_output="uniqueproxylist.txt"): 
  """ proxy will be printed in uniqueproxylist.txt by default """
  proxyfile  = open(self._output, 'r').readlines() 
  outfile  = file(sorted_output, 'a') 
  sortproxy  = [] 
  finalcount = 0 
  for proxy in proxyfile: 
   if proxy not in sortproxy: 
    sortproxy.append(proxy) 
    outfile.write(proxy) 
    finalcount += 1 
  if self._verbose:
   for proxy in sortproxy:
    print proxy,
  print "\n[*] %s%d%s Unique proxy list has been sorted ." % (self._red, int(finalcount), self._reset),
  if sorted_output == "":
   print ""
  else:
   print "saved in %s" % (sorted_output)
   
  outfile.close() 

 # Load self._output into the module-level global 'proxylist' (newlines
 # stripped), which TestProxy() iterates later. Exits the process with
 # status 1 if the file cannot be opened.
 def LoadProxy(self):
  global proxylist 
  try:
   preventstrokes = open(self._output, "r")
   proxylist      = preventstrokes.readlines()
   count          = 0 
   while count < len(proxylist): 
    proxylist[count] = proxylist[count].strip() 
    count += 1 
   print "[*] File successfully loaded..."
  except(IOError): 
     print "\n[-] Error: Check your proxylist path\n"
     sys.exit(1)
  
 # Try to fetch self._testurl through 'proxy' ("host:port").
 # Returns 0 on success, the HTTP status code on HTTPError, or 1 on any
 # other failure (socket timeout, refused connection, bad port, ...).
 # NOTE(review): indentation below is a mangled tab/space mix from the
 # transcription; preserved as-is.
 def CoreFreshTester(self, proxy):
  try: 
   socket.setdefaulttimeout(self._timeout) 
   proxy    = proxy.split(":")
   # proxy_info is built but never used afterwards.
   proxy_info   = {
      'host' : proxy[0],
      'port' : int(proxy[1])
      } 
   proxy_support = urllib2.ProxyHandler({"http" : "%s:%d" % (proxy[0], int(proxy[1]))})
   opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
   opener.addheaders = [('User-agent', random.choice(self._ouruseragent)),
      ('Referer', random.choice(self._referer))]  
   urllib2.install_opener(opener)
   f = urllib2.urlopen(self._testurl)
   if self._verbose:
    print f.headers
    print f.read()
  except urllib2.HTTPError, e:     
   if self._verbose:   
           print 'Error : %s code : %s' % (e, e.code)
          return e.code
  except Exception, detail:
   if self._verbose:
    print "Error : %s" % (detail)
          return 1
         return 0 
          
 # Classify one proxy: CoreFreshTester() returning 0 (falsy) means good;
 # good proxies are recorded in the module-global writegoodpxy file.
 # NOTE(review): proxy was stripped of its newline in LoadProxy and is
 # written back without one, so entries in the good-proxy file run
 # together — confirm intended.
 def MainFreshTester(self, proxy):
  if self.CoreFreshTester(proxy):
   print "[*] %s%s%s \n \'--------------> Bad" % (self._red, proxy, self._reset)
  else:
   print "[*] %s%s%s \n \'--------------> Good" % (self._red, proxy, self._reset)
   writegoodpxy.write(proxy) 
    
 # Test every entry of the global 'proxylist' (LoadProxy() must have run
 # first) and save working proxies to self._goodproxy.
 def TestProxy(self):
  global writegoodpxy
  writegoodpxy  = file(self._goodproxy, 'w') 
  for proxy in proxylist:
   self.MainFreshTester(proxy)
  print "[*] All Fresh proxy has been saved in %s" % (self._goodproxy) 
  writegoodpxy.close() 

''' Direct use class of this library '''
class runengine(object):
 """Command-line front end: parses options and drives a proxyhunter."""
 def __init__(self):
  # Default scrape targets used by the -l/--sitelist and -a/--all options.
  self._sitelist  = ['http://www.proxy-list.net/anonymous-proxy-lists.shtml', 
     'http://www.digitalcybersoft.com/ProxyList/fresh-proxy-list.shtml', 
     'http://www.1proxyfree.com/', 
     'http://www.proxylists.net/http_highanon.txt', 
     'http://www.atomintersoft.com/products/alive-proxy/socks5-list/',
     'http://www.proxylist.net/',
     'http://aliveproxy.com/high-anonymity-proxy-list/',
     'http://spys.ru/en/',
     'http://spys.ru/en/http-proxy-list/',
     'http://atomintersoft.com/free_proxy_list',
     'http://aliveproxy.com/proxy-list/proxies.aspx/Indonesia-id',
    'http://tinnhanh.ipvnn.com/free-proxy/Indonesia_Proxy_List.ipvnn'] 
 # Parse command-line options into module-level globals that run() reads.
 # Exits with a usage message when invoked with no arguments.
 def parseoption(self):
  global jSamairdotru, jSingle, jTestproxy, doall, version, output, proxytest, verbose, goodproxy, timeout  
  baseprog = os.path.basename(sys.argv[0])
  parser   = optparse.OptionParser()
  if len(sys.argv) <= 1:
   parser.exit(msg="""Usage : %s [option]
 -h or --help for get help \n\n""" % (sys.argv[0]))
  ''' parse for option '''  
  parser.add_option("-s", "--samair", 
      dest="jSamairdotru", 
      action="store_true",
                      help="just use samair.ru to hunt proxies")
  parser.add_option("-l", "--sitelist", dest="jSingle", action="store_true",
                    help="use all site in the list")   
  parser.add_option("-t", "--test", 
      dest="jTestproxy", 
      action="store_true",
                    help="test all proxy !") 
  parser.add_option("-a", "--all", 
      dest="doall", 
      action="store_true",
                    help="do all !")
  parser.add_option("-v", "--version", 
      dest="version", 
      action="store_true", 
                    help="print current proxy hunter version")
  parser.add_option("-d", "--debug", 
      dest="verbose", 
      action="store_true",
                    help="debug program for more talkable & every proxy will be print to screen")                    
  parser.add_option("-o", "--outputfile", 
      dest="outputfile", 
      default="proxylist.txt", 
      type="string", 
      action="store", 
      metavar="FILE",
                    help="output proxy will be print     [default : %default]" )                    
  parser.add_option("-i", "--inputfile", 
      dest="inputfile", 
      default="proxylist.txt", 
      type="string", 
      action="store", 
      metavar="FILE",
                    help="input proxy will be checked     [default : %default]")
  parser.add_option("-g", "--outputgood", 
      dest="outputgoodproxy", 
      default="goodproxy.txt", 
      type="string", 
      action="store", 
      metavar="FILE",
                    help="output all good proxy will be saved     [default : %default]")                    
  parser.add_option("-c", "--timeout", 
      dest="timeout", 
      default=30, 
      type="int", 
      action="store", 
      metavar="NUMBER",
                    help="timeout connections being program run    [default : %default]") 
  group = optparse.OptionGroup(parser, "Example ",
                      """%s -s   | Gather proxy with samair.ru
                      
                      %s -l   | Gather proxy in the url list 
                      
                      %s -t proxylist.txt | Test proxy inside proxylist.txt 
                      
                      %s -a   | Do all                        
                      
                      %s -v    | Print current version
                      """ % (baseprog, baseprog, baseprog, baseprog, baseprog))
  parser.add_option_group(group)                                                        
  (options, args) = parser.parse_args()
  # Copy parsed values into module globals for run() to consume.
  jSamairdotru  = options.jSamairdotru
  jSingle  = options.jSingle
  jTestproxy = options.jTestproxy
  doall  = options.doall
  version  = options.version
  output  = options.outputfile
  proxytest = options.inputfile
  verbose  = options.verbose
  goodproxy = options.outputgoodproxy
  timeout  = options.timeout
  
 # Print the library version (module-level __version__).
 def printversion(self):
  print "Version : %s \n" % (__version__) 
          
 # Build a proxyhunter from the globals set by parseoption() — so
 # parseoption() must run first — and execute each selected action.
 def run(self):
  proxyengine = proxyhunter(OutputProxy=output, GoodProxy=goodproxy, Verbose=verbose, TimeOut=timeout, Sitelist=self._sitelist) 
  if version:
   self.printversion()
  if jSamairdotru:
   proxyengine.Samairdotru()
   proxyengine.Cleanitup()
  if jSingle:
   proxyengine.Single()
   proxyengine.Cleanitup()
  if jTestproxy:
   proxyengine.LoadProxy()
   proxyengine.TestProxy()
  if doall:
   proxyengine.Samairdotru()
   proxyengine.Single() 
   proxyengine.LoadProxy()
   proxyengine.TestProxy()   
  
def main():
 # Print the banner, then parse CLI options and dispatch the chosen actions.
 print "\nPyProxy v.%s by %s - Proxy Hunter and Tester Opensource engine\nA high-level cross-protocol proxy-hunter\n" % (__version__, __author__)
 proxyengine = runengine()
 proxyengine.parseoption()
 proxyengine.run()
       
# Standard script entry-point guard.
if __name__ == '__main__':
 main()
An exception has occurred, use %tb to see the full traceback.

SystemExit: 2
PyProxy v.09 by Gunslinger_ <yudha.gunslinger@gmail.com> - Proxy Hunter and Tester Opensource engine
A high-level cross-protocol proxy-hunter


Usage: -c [options]

-c: error: no such option: -f
To exit: use 'exit', 'quit', or Ctrl-D.
In []:
py
Дальше надо бы разобрать этот код на части, надо будет разобрать каждый блок, да и с импортом модулей наверняка возникнут проблемы. Чтобы не засорять эту страничку, она и так будет громоздкой, вспомним про то, что у нас работает сервер Tornado (8888 port), и мы можем подключиться к нему и из консоли.
In [1]:
%lsmagic
Available line magics:
%alias  %alias_magic  %autocall  %automagic  %bookmark  %cd  %clear  %colors  %config  %connect_info  %debug  %dhist  %dirs  %doctest_mode  %ed  %edit  %env  %gui  %hist  %history  %install_default_config  %install_ext  %install_profiles  %killbgscripts  %less  %load  %load_ext  %loadpy  %logoff  %logon  %logstart  %logstate  %logstop  %lsmagic  %macro  %magic  %man  %more  %notebook  %page  %pastebin  %pdb  %pdef  %pdoc  %pfile  %pinfo  %pinfo2  %popd  %pprint  %precision  %profile  %prun  %psearch  %psource  %pushd  %pwd  %pycat  %pylab  %qtconsole  %quickref  %recall  %rehashx  %reload_ext  %rep  %rerun  %reset  %reset_selective  %run  %save  %sc  %store  %sx  %system  %tb  %time  %timeit  %unalias  %unload_ext  %who  %who_ls  %whos  %xdel  %xmode

Available cell magics:
%%!  %%bash  %%capture  %%file  %%perl  %%prun  %%ruby  %%script  %%sh  %%sx  %%system  %%timeit

Automagic is ON, % prefix IS NOT needed for line magics.

In [2]:
%qtconsole
# команда не прошла, наверное она здесь просто не установлена
In []:
python pyproxy.py -h

PyProxy v.09 by Gunslinger_ <yudha.gunslinger@gmail.com> - Proxy Hunter and Tester Opensource engine
A high-level cross-protocol proxy-hunter

Usage: pyproxy.py [options]

Options:
  -h, --help            show this help message and exit
  -s, --samair          just use samair.ru to hunt proxies
  -l, --sitelist        use all site in the list
  -t, --test            test all proxy !
  -a, --all             do all !
  -v, --version         print current proxy hunter version
  -d, --debug           debug program for more talkable & every proxy will be
                        print to screen
  -o FILE, --outputfile=FILE
                        output proxy will be print
                        [default : proxylist.txt]
  -i FILE, --inputfile=FILE
                        input proxy will be checked
                        [default : proxylist.txt]
  -g FILE, --outputgood=FILE
                        output all good proxy will be saved
                        [default : goodproxy.txt]
  -c NUMBER, --timeout=NUMBER
                        timeout connections being program run
                        [default : 30]

  Example :
    pyproxy.py -s                   | Gather proxy with samair.ru
    pyproxy.py -l                   | Gather proxy in the url list
    pyproxy.py -t proxylist.txt     | Test proxy inside proxylist.txt
    pyproxy.py -a                   | Do all
    pyproxy.py -v                   | Print current version
Итак, я сейчас думаю о том, как работает эта программа, но передо мной инфраструктурные задачи: как подключить параллельно консоль, как потом лучше применять дебаггер и какой... Все эти навыки необходимы, это именно должны быть "навыки", как навыки вождения (автомобиля): когда думаешь, куда повернуть, некогда думать о том, в какой последовательности нажимать на педали и переключать передачи...
Поэтому закончим этот пост надеждой - вот внизу то, что мне удалось получить от запуска модуля: он собрал ...неожиданно, списки прокси с некоторых сайтов. Отлично! Значит нужны будут инструменты, чтобы изучить и подправить этот код.
In []:
python pyproxy.py -l

PyProxy v.09 by Gunslinger_ <yudha.gunslinger@gmail.com> - Proxy Hunter and Tester Opensource engine
A high-level cross-protocol proxy-hunter

[*] Parse proxy from www.proxy-list.net/anonymous-proxy-lists.shtml
[*] 0 Proxies receieved from : www.proxy-list.net/anonymous-proxy-lists.shtml                                                   
[*] Parse proxy from www.digitalcybersoft.com/ProxyList/fresh-proxy-list.shtml
[*] 0 Proxies receieved from : www.digitalcybersoft.com/ProxyList/fresh-proxy-list.shtml                                                   
[*] Parse proxy from www.1proxyfree.com/
[*] 0 Proxies receieved from : www.1proxyfree.com/                                                   
[*] Parse proxy from www.proxylists.net/http_highanon.txt
[*] 100 Proxies receieved from : www.proxylists.net/http_highanon.txt                                                   
[*] Parse proxy from www.atomintersoft.com/products/alive-proxy/socks5-list/
[*] 15 Proxies receieved from : www.atomintersoft.com/products/alive-proxy/socks5-list/                                                   
[*] Parse proxy from www.proxylist.net/
[*] 0 Proxies receieved from : www.proxylist.net/                                                   
[*] Parse proxy from aliveproxy.com/high-anonymity-proxy-list/
[*] 10 Proxies receieved from : aliveproxy.com/high-anonymity-proxy-list/                                                   
[*] Parse proxy from spys.ru/en/
[*] 0 Proxies receieved from : spys.ru/en/                                                   
[*] Parse proxy from spys.ru/en/http-proxy-list/
[*] 0 Proxies receieved from : spys.ru/en/http-proxy-list/                                                   
[*] Parse proxy from atomintersoft.com/free_proxy_list
[*] 15 Proxies receieved from : atomintersoft.com/free_proxy_list                                                   
[*] Parse proxy from aliveproxy.com/proxy-list/proxies.aspx/Indonesia-id
[*] 10 Proxies receieved from : aliveproxy.com/proxy-list/proxies.aspx/Indonesia-id                                                   
[*] Parse proxy from tinnhanh.ipvnn.com/free-proxy/Indonesia_Proxy_List.ipvnn
[*] 0 Proxies receieved from : tinnhanh.ipvnn.com/free-proxy/Indonesia_Proxy_List.ipvnn                                                   

[*] 128 Unique proxy list has been sorted . saved in uniqueproxylist.txt
Как здесь и написано, программа сама создала файл uniqueproxylist.txt ...и убрала там дубли!!! Так что у этого поста будет продолжение, но сначала я закреплю навыки работы с дебаггерами.


Посты чуть ниже также могут вас заинтересовать

Комментариев нет:

Отправить комментарий