Source: ・The porn-site scraping program I asked you guys about a while back is done!
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from time import sleep
import csv

HEADER = ['動画タイトル', '再生時間', '画像URL', '動画ページURL', '画質', '閲覧数', '投稿日', '高評価率']

browser = webdriver.Chrome(ChromeDriverManager().install())

# Search keyword
keyword = 'おっぱい'
# Page to start searching from
page = 1
# Number of pages to search
max_page = 100

def get_url(keyword, page=1):
    url = 'https://www.tokyomotion.net/search?search_query={}&search_type=videos&type=public&page={}'.format(keyword, page)
    return url

if __name__ == "__main__":
    while page <= 100:
        url = get_url(keyword, page)
        browser.get(url)
        # Each search-result card on the page
        contents = browser.find_elements_by_class_name('col-sm-4.col-md-3.col-lg-3')
        with open('motion.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(HEADER)
            for content in contents:
                title = content.find_element_by_class_name('video-title')
                time = content.find_element_by_class_name('duration')
                post_date = content.find_element_by_class_name('video-added')
                views = content.find_element_by_class_name('video-views')
                heart = content.find_element_by_class_name('video-rating')
                heart_count = heart.find_element_by_tag_name('b')
                image = content.find_element_by_class_name('img-responsive ').get_attribute('src')
                link = content.find_element_by_class_name('thumb-popu').get_attribute('href')
                # The HD badge only exists on HD videos
                try:
                    content.find_element_by_class_name('hd-text-icon')
                    quality = 'HD'
                except NoSuchElementException:
                    quality = 'SD'
                title_csv = title.text
                time_csv = time.text
                image_csv = image
                link_csv = link
                quality_csv = quality
                pv_csv = views.text
                date_csv = post_date.text
                evaluation_csv = heart_count.text
                row = [title_csv, time_csv, image_csv, link_csv, quality_csv, pv_csv, date_csv, evaluation_csv]
                writer.writerow(row)
        sleep(10)
        page += 1
Surprised a post with this many line breaks doesn't trip the line-break limit
Using the above as a reference, you can browse everything in one go in Excel, thumbnails included, so it's stress-free!
*Though it takes quite a while for the images to load
That sounds like nothing but stress
It's stress-free because there's no need to flip through pages!
I held out! Now I'm off to fap!
https://imgur.com/OQQMmNv.jpg
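For context, here is a rough sketch of how the thumbnail URLs in that CSV could be pulled into an .xlsx with the images embedded, roughly what the screenshot shows. It assumes openpyxl (with Pillow) and requests are installed; the output name motion_thumbs.xlsx and the 160x120 thumbnail size are made up.

import csv
import io
import requests
from openpyxl import Workbook
from openpyxl.drawing.image import Image as XLImage

wb = Workbook()
ws = wb.active
ws.append(['動画タイトル', '再生時間', '動画ページURL', '画質', '閲覧数', '投稿日', '高評価率'])

# Skip the header rows the scraper writes once per page
with open('motion.csv', encoding='utf-8') as f:
    rows = [r for r in csv.reader(f) if r and r[0] != '動画タイトル']

for i, (title, duration, img_url, link, quality, views, date, rating) in enumerate(rows, start=2):
    ws.append([title, duration, link, quality, views, date, rating])
    # Download the thumbnail and anchor it in column H of the same row
    img = XLImage(io.BytesIO(requests.get(img_url, timeout=10).content))
    img.width, img.height = 160, 120
    ws.add_image(img, 'H{}'.format(i))
    ws.row_dimensions[i].height = 95  # roughly 120 px in points

wb.save('motion_thumbs.xlsx')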
Feedback like this seriously helps
When you're figuring it out as you go, you don't notice the wasted steps...
I'll go fix it
That's right!
Run it and a CSV file gets created in the same directory; open that file in Excel or the like and it's easy to fap to
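One caveat when opening that CSV directly in Excel: the script writes plain UTF-8 without a BOM, so the Japanese headers may come out garbled. A minimal re-encoding sketch, with motion_excel.csv as a made-up output name:

with open('motion.csv', encoding='utf-8') as src:
    data = src.read()
# utf-8-sig prepends a BOM, which Excel uses to detect UTF-8
with open('motion_excel.csv', 'w', encoding='utf-8-sig', newline='') as dst:
    dst.write(data)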
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from time import sleep
import csv

HEADER = ['動画タイトル', '再生時間', '画像URL', '動画ページURL', '画質', '閲覧数', '投稿日', '高評価率']

browser = webdriver.Chrome(ChromeDriverManager().install())

# Search keyword
keyword = 'おっぱい'
# Page to start searching from
page = 1
# Number of pages to search
max_page = 100

def get_url(keyword, page=1):
    url = 'https://www.tokyomotion.net/search?search_query={}&search_type=videos&type=public&page={}'.format(keyword, page)
    return url

if __name__ == "__main__":
    while page <= 3:
        url = get_url(keyword, page)
        browser.get(url)
        contents = browser.find_elements_by_class_name('col-sm-4.col-md-3.col-lg-3')
        with open('motion1.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(HEADER)
            for content in contents:
                title = content.find_element_by_class_name('video-title').text
                time = content.find_element_by_class_name('duration').text
                date = content.find_element_by_class_name('video-added').text
                views = content.find_element_by_class_name('video-views').text
                heart = content.find_element_by_class_name('video-rating')
                heart_count = heart.find_element_by_tag_name('b').text
                image = content.find_element_by_class_name('img-responsive ').get_attribute('src')
                link = content.find_element_by_class_name('thumb-popu').get_attribute('href')
                try:
                    content.find_element_by_class_name('hd-text-icon')
                    quality = 'HD'
                except NoSuchElementException:
                    quality = 'SD'
                row = [title, time, image, link, quality, views, date, heart_count]
                writer.writerow(row)
        sleep(10)
        page += 1
Fixed it!
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from time import sleep
import csv

HEADER = ['動画タイトル', '再生時間', '画像URL', '動画ページURL', '画質', '閲覧数', '投稿日', '高評価率']

browser = webdriver.Chrome(ChromeDriverManager().install())

# Search keyword
keyword = 'おっぱい'
# Page to start searching from
page = 1
# Number of pages to search
max_page = 1
# Name of the CSV file to save to
save_csv = 'motion8.csv'

def get_url(keyword, page=1):
    url = 'https://www.tokyomotion.net/search?search_query={}&search_type=videos&type=public&page={}'.format(keyword, page)
    return url

if __name__ == "__main__":
    while page <= max_page:
        url = get_url(keyword, page)
        browser.get(url)
        contents = browser.find_elements_by_class_name('col-sm-4.col-md-3.col-lg-3')
        # Append mode: the header row is written again each time this block runs (once per page)
        with open(save_csv, 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(HEADER)
            for content in contents:
                title = content.find_element_by_class_name('video-title').text
                time = content.find_element_by_class_name('duration').text
                date = content.find_element_by_class_name('video-added').text
                views = content.find_element_by_class_name('video-views').text
                heart = content.find_element_by_class_name('video-rating')
                heart_count = heart.find_element_by_tag_name('b').text
                image = content.find_element_by_class_name('img-responsive ').get_attribute('src')
                link = content.find_element_by_class_name('thumb-popu').get_attribute('href')
                try:
                    content.find_element_by_class_name('hd-text-icon')
                    quality = 'HD'
                except NoSuchElementException:
                    quality = 'SD'
                row = [title, time, image, link, quality, views, date, heart_count]
                writer.writerow(row)
        sleep(10)
        page += 1
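A side note for anyone running this now: Selenium 4 removed the find_element_by_* helpers and expects a Service object for the driver, and with append mode plus the header written inside the loop the CSV accumulates one duplicate header per page. A rough modern equivalent under those assumptions, keeping the same selectors (not tested against the site):

import csv
from time import sleep

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

HEADER = ['動画タイトル', '再生時間', '画像URL', '動画ページURL', '画質', '閲覧数', '投稿日', '高評価率']
browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

def get_url(keyword, page=1):
    return 'https://www.tokyomotion.net/search?search_query={}&search_type=videos&type=public&page={}'.format(keyword, page)

keyword = 'おっぱい'
max_page = 1
save_csv = 'motion8.csv'

# utf-8-sig adds a BOM so Excel recognises the file as UTF-8
with open(save_csv, 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(HEADER)  # header written once, not once per page
    for page in range(1, max_page + 1):
        browser.get(get_url(keyword, page))
        for content in browser.find_elements(By.CSS_SELECTOR, '.col-sm-4.col-md-3.col-lg-3'):
            title = content.find_element(By.CLASS_NAME, 'video-title').text
            duration = content.find_element(By.CLASS_NAME, 'duration').text
            date = content.find_element(By.CLASS_NAME, 'video-added').text
            views = content.find_element(By.CLASS_NAME, 'video-views').text
            rating = content.find_element(By.CLASS_NAME, 'video-rating').find_element(By.TAG_NAME, 'b').text
            image = content.find_element(By.CLASS_NAME, 'img-responsive').get_attribute('src')
            link = content.find_element(By.CLASS_NAME, 'thumb-popu').get_attribute('href')
            try:
                content.find_element(By.CLASS_NAME, 'hd-text-icon')
                quality = 'HD'
            except NoSuchElementException:
                quality = 'SD'
            writer.writerow([title, duration, image, link, quality, views, date, rating])
        sleep(10)  # be polite between pages
browser.quit()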