__author__ = 'Saint'

import json
import os
import urllib.error
import urllib.parse
import urllib.request
from html.parser import HTMLParser

# Pick the image URLs out of the fetched page content

class MyHtmlParser(HTMLParser):
    """HTML parser that collects the ``src`` attribute of ``<img>`` tags.

    After markup has been fed to the parser, ``links`` holds the collected
    ``src`` values in the order they were encountered.
    """

    def __init__(self, *args, **kwargs):
        """Create a parser with its own, initially empty, ``links`` list.

        All arguments are forwarded unchanged to ``HTMLParser.__init__``.
        """
        super().__init__(*args, **kwargs)
        # Per-instance list: a class-level list would be shared by every
        # parser, so links found on one page would show up again in the
        # results of every page parsed afterwards.
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the ``src`` value(s) of an ``<img>`` start tag.

        Args:
            tag: Tag name as passed in by ``HTMLParser``.
            attrs: List of ``(name, value)`` attribute pairs for the tag.
        """
        if tag != "img":
            return
        for name, value in attrs:
            # A bare attribute (``<img src>``) has value None and an empty
            # one has value ""; neither is a usable download address.
            if name == "src" and value:
                self.links.append(value)

class Down(object):
    """Interactive downloader for images found in subscription feeds.

    For every address in ``collect_links`` the article list is fetched as
    JSON, each listed article page is downloaded, and every image referenced
    by an ``<img src=...>`` on that page is saved below ``img_path``.
    """

    # Root directory under which one folder per source is created
    img_path = "E:/saint"

    # Folder that downloads are currently written to (set by isDirExists)
    dir = ''

    # Addresses of the article lists to collect from
    collect_links = ["http://dy.163.com/v2/media/articlelist/T1374483113516-1", "http://dy.163.com/v2/media/articlelist/T1420776257254-1", "http://dy.163.com/v2/media/articlelist/T1376641060407-1"]

    # Base address of an article page; "/<tid>/<docid>" is appended to it
    img_links = "http://dy.163.com/v2/article"

    def handleCollect(self):
        """Walk through all sources and download their images interactively.

        After each article the user is asked whether to continue; answering
        "n" stops the current source and moves on to the next one. When a
        source cannot be listed, "y" skips it, while "n" or any other input
        ends the whole run.
        """
        for collect_link in self.collect_links:
            print("開始從[" + collect_link + "]采集圖片")

            # The last URL segment doubles as the per-source folder name.
            self.isDirExists(collect_link.split("/")[-1])

            pages = self.getListFromSubscribe(collect_link)
            if pages is False:
                print("數(shù)據(jù)采集失敗，是否繼續(xù)(y/n)")
                answer = input().strip().lower()
                if answer == "y":
                    self._clear_screen()
                    continue
                if answer == "n":
                    print("停止采集")
                else:
                    self._clear_screen()
                    print("非法輸入")
                break

            # "or []" tolerates a response whose data field is null.
            for page in pages or []:
                page_uri = self.img_links + "/" + page["tid"] + "/" + page["docid"]
                self.getImgFromUri(page_uri)

                print("是否繼續(xù)(y/n)")
                if input().strip().lower() == "n":
                    self._clear_screen()
                    print("采集完畢")
                    break

        print("OK")

    def _clear_screen(self):
        """Clear the terminal using the command that fits the platform."""
        os.system("cls" if os.name == "nt" else "clear")

    # Fetch the article list from a subscription source
    def getListFromSubscribe(self, uri):
        """Fetch and decode the article list published at ``uri``.

        Args:
            uri: Address returning a GBK-encoded JSON document with the keys
                ``code``, ``msg`` and ``data``.

        Returns:
            The value of the ``data`` field when ``code`` equals 1, otherwise
            ``False`` (request failed, non-2xx status, undecodable body, or
            an error code reported in the document).
        """
        try:
            # "with" guarantees the connection is released on every path.
            with urllib.request.urlopen(uri) as res:
                status = res.status
                raw = res.read()
        except urllib.error.URLError as err:
            # urlopen raises for 4xx/5xx and for connection problems, so a
            # status check alone would never see those failures.
            print(err)
            self._clear_screen()
            return False

        if not 200 <= status < 300:
            self._clear_screen()
            return False

        try:
            # read() returns bytes; the service is decoded as GBK here
            payload = json.loads(raw.decode("gbk"))
        except ValueError as err:
            # Covers both UnicodeDecodeError and malformed JSON.
            print(err)
            return False

        if payload.get('code') != 1:
            print(payload.get('msg'))
            return False
        return payload.get('data')

    # Fetch one article page and download the images found in it
    def getImgFromUri(self, uri):
        """Download every image referenced by the page at ``uri``.

        Failures are reported on stdout and skipped so that one broken page
        or image does not abort the whole collection run.

        Args:
            uri: Address of the article page (decoded as GBK).
        """
        try:
            with urllib.request.urlopen(uri) as res:
                # Undecodable bytes must not prevent the tags from being parsed.
                html_code = res.read().decode("gbk", errors="replace")
        except urllib.error.URLError as err:
            print("skip page [" + uri + "]: " + str(err))
            return

        hp = MyHtmlParser()
        # Start from an instance-level list so that nothing collected while
        # parsing an earlier page is carried over into this one.
        hp.links = []
        hp.feed(html_code)
        hp.close()

        for link in hp.links:  # hp.links is the list of image addresses
            if not link:
                continue
            # Resolve relative and protocol-relative addresses against the page.
            img_url = urllib.parse.urljoin(uri, link)
            try:
                self.writeToDisk(img_url)
            except (OSError, ValueError) as err:
                # URLError is an OSError; ValueError covers malformed URLs.
                print("skip image [" + img_url + "]: " + str(err))

    # Make sure the download folder exists, creating it when necessary
    def isDirExists(self, dir_name):
        """Select ``<img_path>/<dir_name>`` as download folder and create it.

        Args:
            dir_name: Name of the folder below ``img_path``.

        Returns:
            Always ``True``; the folder exists once the call returns.
        """
        # Join with a separator; plain concatenation would produce a sibling
        # of img_path (e.g. "E:/saintT137...") instead of a sub-folder.
        self.dir = os.path.join(self.img_path, dir_name)
        os.makedirs(self.dir, exist_ok=True)
        return True

    # Download a file and write it to disk
    def writeToDisk(self, url):
        """Download ``url`` and store it in the current download folder.

        The file name is the last segment of the URL path, so a query string
        or fragment never ends up in the name. The working directory of the
        process is left untouched.

        Args:
            url: Absolute address of the file to download.

        Returns:
            ``True`` once the file has been written.

        Raises:
            urllib.error.URLError: The download failed.
            OSError: The target file could not be written.
        """
        with urllib.request.urlopen(url) as res:
            content = res.read()

        file_name = os.path.basename(urllib.parse.urlsplit(url).path)
        with open(os.path.join(self.dir, file_name), "wb") as out:
            out.write(content)
        return True

if __name__ == "__main__":
    # Script entry point: build the downloader and start collecting.
    Down().handleCollect()