Welcome 微信登录
编程资源 图片资源库 蚂蚁家优选 PDF转换器

首页 / 操作系统 / Linux / Python学习笔记-简易抓取网页

主要是通过 urllib2 获取预先指定好的地址页面,通过 BeautifulSoup 来解析页面元素,找到 <a> 标签的 href 属性,并将相关的数据存入数据库,以方便后面取出继续抓取。整个抓取和解析是基于多线程与队列来控制的。做得比较简单与粗糙,后续深入可以改进。

import DBHelper
import CodeHelper
import urllib2
from bs4 import BeautifulSoup
import threading as thread
import Queue
import time


class Resource:
    """One row of the ``resource`` table: a page URL plus its crawl state.

    The four constructor arguments are stored as-is and bound to the
    ``url``, ``text``, ``content`` and ``status`` columns by the SQL
    statements below.  Elsewhere in this file status 0 is used for rows
    that still have to be fetched and status 1 for rows already fetched.

    Every method opens its own ``DBHelper.DBHelper()``.
    """

    def __init__(self, url, text, content, status):
        self._url = url
        self._text = text
        self._content = content
        self._status = status

    def _execute_and_commit(self, sql, params):
        """Run one write statement and commit, always closing the helper."""
        dbHelper = DBHelper.DBHelper()
        try:
            dbHelper.execute(sql, params)
            # NOTE(review): "commint" is the method name this file has always
            # called; DBHelper is not visible here, so the spelling is kept.
            # Confirm against DBHelper whether it should be "commit".
            dbHelper.commint()
        finally:
            dbHelper.close()

    def insert(self):
        """Insert this resource unless a row with the same url already exists."""
        dbHelper = DBHelper.DBHelper()
        try:
            existing = dbHelper.read(
                "select * from resource where url=%s", [self._url])
            if existing is not None:
                # Already known: nothing to do (helper is closed by finally).
                return
            # Four values need four placeholders; the previous three-placeholder
            # format string raised TypeError before the insert could run.
            print("url: %s text: %s content: %s status: %s"
                  % (self._url, self._text, self._content, self._status))
            dbHelper.execute(
                "insert into resource(url,text,content,status) values(%s,%s,%s,%s)",
                [self._url, self._text, self._content, self._status])
            dbHelper.commint()  # NOTE(review): spelling kept, see _execute_and_commit
        finally:
            dbHelper.close()

    def updateStatus(self):
        """Set ``status`` on the row whose url matches this resource."""
        self._execute_and_commit(
            "update resource set status=%s where url=%s",
            [self._status, self._url])

    def updateContentAndStatus(self):
        """Set ``content`` and ``status`` on the row whose url matches."""
        self._execute_and_commit(
            "update resource set content=%s,status=%s where url=%s",
            [self._content, self._status, self._url])

    def readListByStatus(self):
        """Return whatever ``DBHelper.readList`` yields for rows with this status."""
        dbHelper = DBHelper.DBHelper()
        # NOTE(review): the helper is not closed here, as in the original;
        # whether readList returns materialised rows is not visible from this
        # file, so closing after the call has not been added -- confirm.
        return dbHelper.readList(
            "select * from resource where status=%s", [self._status])

    def readList(self):
        """Return whatever ``DBHelper.readList`` yields for all rows."""
        dbHelper = DBHelper.DBHelper()
        # NOTE(review): helper left open, see readListByStatus.
        return dbHelper.readList("select * from resource")
       
class ResourceThread(thread.Thread):
    """Daemon worker that drains ``(func, args)`` tasks from a shared queue.

    The thread starts itself from ``__init__`` and returns from ``run`` as
    soon as a non-blocking ``get`` finds the queue empty, so tasks must be
    queued before the worker is created.
    """

    def __init__(self, task_queue):
        """Store *task_queue*, mark the thread as daemon and start it."""
        thread.Thread.__init__(self)
        self._task_queue = task_queue
        self.daemon = True
        self.start()

    def run(self):
        """Execute queued tasks until the queue is empty.

        Each task is called as ``func(args)`` where ``args`` is the tuple
        that was queued (it is passed as one argument, not unpacked).
        A task that raises is reported and skipped; it no longer stops the
        worker, and ``task_done()`` is called for every task taken.
        """
        print("current thread name %s" % thread.current_thread().name)
        while True:
            try:
                task = self._task_queue.get(block=False)
            except Queue.Empty:
                # Nothing left to do: this is the normal way out.
                break
            try:
                func, args = task
                func(args)
            except Exception as e:
                # Worker boundary: report the failure and move on so one bad
                # task cannot strand the remaining ones.
                print(str(e))
            finally:
                self._task_queue.task_done()
           
class ResourceManager:
    """Owns the task queue and the worker threads that consume it.

    Constructing a manager queues ``taskNum`` ``do_task`` entries (each
    tagged with its index) and then creates ``threadNum`` ``ResourceThread``
    workers on that queue.
    """

    def __init__(self, taskNum=10, threadNum=2):
        self._task_queue = Queue.Queue()
        self._threads = []
        # Queue first, start workers second: the order of these two calls
        # is the original one and is kept.
        self.__init__task_queue__(taskNum)
        self.__init__thread_pool(threadNum)

    def __init__task_queue__(self, taskNum):
        """Queue ``taskNum`` do_task entries, announcing each one."""
        for task_index in range(taskNum):
            print("this is %s task" % task_index)
            self.add_task(do_task, task_index)

    def __init__thread_pool(self, threadNum):
        """Create ``threadNum`` workers and remember them for joining."""
        for worker_index in range(threadNum):
            print("threadNum %s" % worker_index)
            self._threads.append(ResourceThread(self._task_queue))

    def add_task(self, func, *args):
        """Queue *func* together with its positional arguments as a tuple."""
        task = (func, args)
        self._task_queue.put(task)

    def check_queue(self):
        """Return the queue's current ``qsize()``."""
        pending_count = self._task_queue.qsize()
        return pending_count

    def wait_for_complete(self):
        """Join every worker that is still alive."""
        for worker in self._threads:
            if not worker.isAlive():
                continue
            worker.join()
   
def do_task(args):
    """Fetch every status-0 resource and store the links found on each page.

    For each row read with status 0 whose url (column 1) contains
    ``http://``: download the page, save the first 9999 characters of its
    ``<body>`` with status 1, then insert every ``<a href>`` on the page as
    a new status-0 resource.

    A page that cannot be downloaded is reported and skipped instead of
    aborting the whole task, and anchors without an ``href`` are ignored.

    :param args: task tag supplied by the worker; only printed.
    """
    # The worker passes the queued 1-tuple, so plain %-formatting prints its
    # single element, exactly as before.
    print("this task args %s" % args)
    pending = Resource(None, None, None, 0).readListByStatus()
    # Wrap in a tuple so a tuple-of-rows result cannot be mistaken for the
    # format arguments (which raised TypeError for any length other than 1).
    print("read status 0 data is %s" % (pending,))
    if pending is None:
        return
    for item in pending:
        url = item[1]
        if url is None or "http://" not in url:
            continue
        try:
            response = urllib2.urlopen(url)
            try:
                content = response.read()
            finally:
                response.close()
        except (urllib2.URLError, IOError, ValueError) as e:
            # One unreachable or malformed URL must not stop the other rows.
            print("fetch %s failed: %s" % (url, e))
            continue
        html = BeautifulSoup(content)
        fetch_resource = Resource(url, None, str(html.find("body"))[0:9999], 1)
        fetch_resource.updateContentAndStatus()
        aLinks = html.find_all("a")
        print("aLinks %s" % aLinks)
        for aLink in aLinks:
            href = aLink.get("href")
            if not href:
                # <a> without href (e.g. a named anchor): nothing to crawl.
                continue
            a_text = CodeHelper.encodeContent(aLink.get_text())
            print("href %s text %s" % (href, a_text))
            Resource(href, a_text, "", 0).insert()
           
def execute():
    """Seed the start URLs, run the crawl and print the elapsed time.

    Each start URL is inserted as a status-0 resource, then a
    ``ResourceManager`` with 20 tasks and 4 threads is created and waited on.
    """
    urls = ["http://www.kuwo.cn", "http://www.1ting.com/", "http://www.kugou.com/", "http://y.**.com/"]
    for url in urls:
        # Resource takes (url, text, content, status); the content argument
        # was missing before, which raised TypeError on the first URL.
        Resource(url, None, None, 0).insert()

    start = time.time()
    resource_manager = ResourceManager(20, 4)
    resource_manager.wait_for_complete()
    end = time.time()
    print("cost all time: %s" % (end - start))


if __name__ == "__main__":
    execute()

# Related articles listed at the end of the original page:
#   "Core Python Programming, 2nd Edition" (Wesley J. Chun) [HD PDF, Chinese edition]
#       http://www.linuxidc.com/Linux/2013-06/85425.htm
#   "Python Development Techniques Explained" (Zhou Wei, Zong Jie)
#       [HD scanned PDF + companion videos + code]
#       http://www.linuxidc.com/Linux/2013-11/92693.htm
#   Getting Linux system information with a Python script
#       http://www.linuxidc.com/Linux/2013-08/88531.htm
#   Building a desktop algorithmic-trading research environment with Python on Ubuntu
#       http://www.linuxidc.com/Linux/2013-11/92534.htm
#   Python in detail: "click here" (the link target was lost in extraction)
Python 的下载地址:请点这里
本文永久更新链接地址:http://www.linuxidc.com/Linux/2014-06/103212.htm