「Python筆記」python爬蟲簡單實戰

「Python筆記」python爬蟲簡單實戰

資源介紹參數
資源類別: Python
如遇問題: 聯繫客服/留言反饋
釋放雙眼,帶上耳機,聽聽看~!
使用requests+BeautifulSoup+sqlalchemy+pymysql爬取貓眼TOP100並寫入數據庫和txt文檔做題用到爬蟲正好複習一下一些東西,爬取貓眼TOP100電影,並用sqlalchemy寫入數據庫,並寫入txt文檔。

使用requests+BeautifulSoup+sqlalchemy+pymysql爬取貓眼TOP100並寫入數據庫和txt文檔

做題用到爬蟲正好複習一下一些東西,爬取貓眼TOP100電影,並用sqlalchemy寫入數據庫,並寫入txt文檔

先做好數據庫連接的配置

from sqlalchemy import create_engine,Column,Integer,String,Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

HOSTNAME = '127.0.0.1'
DATABASE = 'movies'
PORT = '3306'
USERNAME = 'root'
PASSWORD = 'root'
DB_URL = "mysql+pymysql://{username}:{password}@{host}:{port}/{database}?charset=utf8mb4".format(username=USERNAME,password=PASSWORD,host=HOSTNAME, port=PORT,database=DATABASE)
engine = create_engine(DB_URL)
conn = engine.connect()
Base = declarative_base()
Session = sessionmaker(engine)()

創建數據表

class Movies(Base):
    __tablename__ = 'movies'
    index = Column(Integer,primary_key=True,autoincrement=True)
    src = Column(Text,nullable=False)
    name = Column(String(50),nullable=False)
    actor = Column(String(50),nullable=False)
    time = Column(String(50),nullable=False)
    score = Column(String(50),nullable=False)

Base.metadata.create_all(engine)

alter = 'alter table movies convert to character set utf8mb4;'
conn.execute(alter)

要注意執行修改字符集語句,否賊無法寫入

分析結構

from bs4 import BeautifulSoup
import requests
import re

def main(index):
    req = requests.get(url.format(str(index)))
    soup = BeautifulSoup(req.text, "html5lib")
    for item in soup.select('dd'):
        pass

分析結構可以看出,每一部電影都寫在一個<dd>標籤中,只要獲取到這個標籤,再向下搜尋就能得到想要的數據

爬取數據

def get_index(item):
    index = item.select_one("i").text
    return index

def get_src(item):
    img_src = item.select("img")[1]
    template = re.compile('data-src="(.*?)"')
    img_src = template.findall(str(img_src))[0]
    return img_src

def get_name(item):
    name = item.select(".name")[0].text
    return name

def get_actor(item):
    actor = item.select(".star")[0].text.split(':')[1]
    return actor

def get_time(item):
    time = item.select(".releasetime")[0].text.split(':')[1]
    return time

def get_score(item):
    score = item.select('.integer')[0].text + item.select('.fraction')[0].text
    return score

獲取需要的信息,因為srcdata-scr中,所以這裡我用正則去獲取。

構造dict

def get_dict(item):
    index = int(get_index(item))
    src = get_src(item)
    name = get_name(item)
    actor = get_actor(item)
    time = get_time(item)
    score = get_score(item)
    movies_dict = {'index': index, 'src': src, 'name': name, 'actor': actor, 'time': time, 'score': score}
    return movies_dict

將爬取的數據整理成dict(寫完後覺得這步沒有必要)

寫入txt

def write_file(content):
    content = json.dumps(content,ensure_ascii=False)
    with open('result.txt','a') as f:
        f.write(content +'n')

這裡需要將dictjson.dumps方法編碼成json字符串,否則無法寫入

寫入數據庫

def write_to_mysql(content):
    src = content['src']
    name = content['name']
    actor = content['actor'].split('n')[0]
    time = content['time']
    score = content['score']
    data = Movies(src = src,name=name,actor=actor,time=time,score=score)
    Session.add(data)
    Session.commit()

在主函式中調用

def main(index):
    req = requests.get(url.format(str(index)))
    soup = BeautifulSoup(req.text, "html5lib")
    for item in soup.select('dd'):
        movies_dict = get_dict(item)
        write_to_mysql(movies_dict)
        write_file(movies_dict)

爬取所有頁面

for i in range(10):
    main(i*10)

完整代碼

from bs4 import BeautifulSoup
from sqlalchemy import create_engine,Column,Integer,String,Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
import requests
import re
import json

HOSTNAME = '127.0.0.1'
DATABASE = 'movies'
PORT = '3306'
USERNAME = 'root'
PASSWORD = 'root'
DB_URL = "mysql+pymysql://{username}:{password}@{host}:{port}/{database}?charset=utf8mb4".format(username=USERNAME,password=PASSWORD,host=HOSTNAME, port=PORT,database=DATABASE)
engine = create_engine(DB_URL)
conn = engine.connect()

Base = declarative_base()
Session = sessionmaker(engine)()

class Movies(Base):
    __tablename__ = 'movies'
    index = Column(Integer,primary_key=True,autoincrement=True)
    src = Column(Text,nullable=False)
    name = Column(String(50),nullable=False)
    actor = Column(String(50),nullable=False)
    time = Column(String(50),nullable=False)
    score = Column(String(50),nullable=False)

Base.metadata.create_all(engine)

alter = 'alter table movies convert to character set utf8mb4;'
conn.execute(alter)

def get_index(item):
    index = item.select_one("i").text
    return index

def get_src(item):
    img_src = item.select("img")[1]
    template = re.compile('data-src="(.*?)"')
    img_src = template.findall(str(img_src))[0]
    return img_src

def get_name(item):
    name = item.select(".name")[0].text
    return name

def get_actor(item):
    actor = item.select(".star")[0].text.split(':')[1]
    return actor

def get_time(item):
    time = item.select(".releasetime")[0].text.split(':')[1]
    return time

def get_score(item):
    score = item.select('.integer')[0].text + item.select('.fraction')[0].text
    return score

def get_dict(item):
    index = int(get_index(item))
    src = get_src(item)
    name = get_name(item)
    actor = get_actor(item)
    time = get_time(item)
    score = get_score(item)
    movies_dict = {'index': index, 'src': src, 'name': name, 'actor': actor, 'time': time, 'score': score}
    return movies_dict

def write_file(content):
    content = json.dumps(content,ensure_ascii=False)
    with open('result.txt','a') as f:
        f.write(content +'n')

def write_to_mysql(content):
    src = content['src']
    name = content['name']
    actor = content['actor'].split('n')[0]
    time = content['time']
    score = content['score']
    data = Movies(src = src,name=name,actor=actor,time=time,score=score)
    Session.add(data)
    Session.commit()

def main(index):
    req = requests.get(url.format(str(index)))
    soup = BeautifulSoup(req.text, "html5lib")
    for item in soup.select('dd'):
        movies_dict = get_dict(item)
        write_to_mysql(movies_dict)
        write_file(movies_dict)

url = 'https://maoyan.com/board/4?offset={}'

for i in range(10):
    main(i*10)

使用selenium爬取空間說說

配置驅動,模擬登入

from selenium import webdriver
import time

qq = input("請輸入qq號")

ss_url ='https://user.qzone.qq.com/{}/311'.format(qq)

driver = webdriver.Chrome("chromedriver.exe")
driver.maximize_window()

driver.get(ss_url)
driver.switch_to.frame('login_frame')
driver.find_element_by_class_name('face').click()

next_page='page'
page=1

抓取說說

while next_page:
    time.sleep(2)
    # driver.implicitly_wait(100)
    driver.switch_to.frame('app_canvas_frame')
    content = driver.find_elements_by_css_selector('.content')
    stime = driver.find_elements_by_css_selector('.c_tx.c_tx3.goDetail')
    print('正在抓取第%s頁'%page)
    for con, sti in zip(content, stime):
        data = {
            'time': sti.text,
            'shuos': con.text
        }
        print(data)
    time.sleep(1)

使用zip構建元組來遍歷
使用time.sleep()來等待頁面加載(因為隱式等待和顯示等待沒搞明白,所以用強制等待。。。。)

翻頁

    next_page = driver.find_element_by_link_text('下一頁')
    page = page+1
    next_page.click()
    driver.switch_to.parent_frame()

翻頁後要使用driver.switch_to.parent_frame()找到上策frame,否則無法定位標籤

完整代碼

from selenium import webdriver
import time

qq = input("請輸入qq號")

ss_url ='https://user.qzone.qq.com/{}/311'.format(qq)

driver = webdriver.Chrome("chromedriver.exe")
driver.maximize_window()

driver.get(ss_url)
driver.switch_to.frame('login_frame')
driver.find_element_by_class_name('face').click()
next_page='page'
page=1
while next_page:
    time.sleep(2)
    # driver.implicitly_wait(100)
    driver.switch_to.frame('app_canvas_frame')
    content = driver.find_elements_by_css_selector('.content')
    stime = driver.find_elements_by_css_selector('.c_tx.c_tx3.goDetail')
    print('正在抓取第%s頁'%page)
    for con, sti in zip(content, stime):
        data = {
            'time': sti.text,
            'shuos': con.text
        }
        print(data)
    time.sleep(1)
    next_page = driver.find_element_by_link_text('下一頁')
    page = page+1
    next_page.click()
    driver.switch_to.parent_frame()
聲明:本文為原創作品,版權歸作者所有。未經許可,不得轉載或用於任何商業用途。如若本站內容侵犯了原著者的合法權益,可聯繫我們進行處理。

給TA打賞
共{{data.count}}人
人已打賞
0 條回復 A文章作者 M管理員
    暫無討論,說說你的看法吧