日向坂の公式ブログからスクレイピングした画像のパスを描画するだけのもの
Pythonでやってみた
#全ての画像をDynamoDBに入れる
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import boto3
from boto3.dynamodb.conditions import Key, Attr
import os
def main():
    """Back-fill the table with blog images for every member, year and month.

    Sets the AWS credential environment variables first, then visits each
    member / year / month combination: the image URLs of that page are
    scraped and immediately written to DynamoDB before moving on.
    """
    set_env()
    member_codes = ["2","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24"]
    target_years = ["2016","2017","2018","2019","2020"]
    target_months = ["01","02","03","04","05","06","07","08","09","10","11","12"]
    # Same visiting order as three nested loops: member, then year, then month.
    combinations = [
        (code, yyyy, mm)
        for code in member_codes
        for yyyy in target_years
        for mm in target_months
    ]
    for code, yyyy, mm in combinations:
        urls = getPhotosPath(code, yyyy, mm)
        insertData(urls, code, yyyy, mm)
def getPhotosPath(member, year, month):
    """Collect the image URLs shown on one member's monthly blog list page.

    Args:
        member: Member code, sent as the ``ct`` query parameter (string).
        year: Year as a string, e.g. ``"2020"``.
        month: Zero-padded month as a string, e.g. ``"06"``.

    Returns:
        The ``src`` value of every ``<img>`` inside the first
        ``.l-maincontents--blog`` element, skipping missing or empty ``src``
        attributes. An empty list is returned when the page contains no such
        element.
    """
    url = "https://www.hinatazaka46.com/s/official/diary/member/list?ima=0000&ct=" + member +"&dy=" + year + month
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # NOTE(review): the HTML parsed below comes from requests, not from
        # the browser session opened above - confirm the browser is needed.
        html = requests.get(url)
        bs = BeautifulSoup(html.text, "html.parser")
        maincontents = bs.select('.l-maincontents--blog')
        if not maincontents:
            # No blog container on the page: nothing to collect.
            return []
        sources = (image.get('src') for image in maincontents[0].find_all("img"))
        # Drop both missing (None) and empty src attributes.
        return [src for src in sources if src]
    finally:
        # Always shut the browser down, even when scraping raises.
        driver.quit()
def insertData(img_url_list, member, year, month):
    """Store one item per image URL in the DynamoDB table.

    Each item uses the URL for both ``first`` and ``second``, and records
    ``member`` plus ``month`` (the concatenation ``year + month``).
    ``None`` entries in ``img_url_list`` are skipped.
    """
    # overwrite_by_pkeys: overwrite when the partition key is duplicated.
    with get_table().batch_writer(overwrite_by_pkeys=['first', 'second']) as writer:
        for url in img_url_list:
            # None values were also present in the list, so filter them out.
            if url is None:
                continue
            record = {
                'first': url,
                'second': url,
                'member': member,
                'month': year+month,
            }
            writer.put_item(Item=record)
def set_env():
    """Set the AWS access key and secret key environment variables.

    NOTE(review): the values look like masked placeholders - confirm how
    real credentials are meant to be supplied.
    """
    os.environ.update({
        'AWS_ACCESS_KEY_ID': '******',
        'AWS_SECRET_ACCESS_KEY': '*****',
    })
def get_table():
    """Return the boto3 DynamoDB ``Table`` object for the ``hinata`` table."""
    return boto3.resource('dynamodb').Table('hinata')
# Run the full back-fill only when this file is executed as a script.
if '__main__' == __name__:
    main()
今のところ、1日3回実行しています。
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import boto3
from boto3.dynamodb.conditions import Key, Attr
import os
import datetime
import pytz
def lambda_handler(event, context):
    """Scrape and store the current month's blog images for every member.

    The current year and month are taken from the clock in the Asia/Tokyo
    time zone. ``event`` and ``context`` are not used.
    """
    member_codes = ["2","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24"]
    today = datetime.datetime.now(pytz.timezone('Asia/Tokyo'))
    yyyy = str(today.year)
    # Zero-pad the month to two digits.
    mm = f'{today.month:02}'
    for code in member_codes:
        urls = getPhotosPath(code, yyyy, mm)
        insertData(urls, code, yyyy, mm)
def getPhotosPath(member, year, month):
    """Collect the image URLs shown on one member's monthly blog list page.

    A headless Chromium is started from the binaries under
    ``/opt/headless/python/bin``.

    Args:
        member: Member code, sent as the ``ct`` query parameter (string).
        year: Year as a string, e.g. ``"2020"``.
        month: Zero-padded month as a string, e.g. ``"06"``.

    Returns:
        The ``src`` value of every ``<img>`` inside the first
        ``.l-maincontents--blog`` element, skipping missing or empty ``src``
        attributes. An empty list is returned when the page contains no such
        element.
    """
    options = webdriver.ChromeOptions()
    options.binary_location = "/opt/headless/python/bin/headless-chromium"
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--single-process")
    url = "https://www.hinatazaka46.com/s/official/diary/member/list?ima=0000&ct=" + member +"&dy=" + year + month
    driver = webdriver.Chrome(
        executable_path="/opt/headless/python/bin/chromedriver",
        options=options
    )
    try:
        driver.get(url)
        # NOTE(review): the HTML parsed below comes from requests, not from
        # the browser session opened above - confirm the browser is needed.
        html = requests.get(url)
        bs = BeautifulSoup(html.text, "html.parser")
        maincontents = bs.select('.l-maincontents--blog')
        if not maincontents:
            # No blog container on the page: nothing to collect.
            return []
        sources = (image.get('src') for image in maincontents[0].find_all("img"))
        # Drop both missing (None) and empty src attributes.
        return [src for src in sources if src]
    finally:
        # Always shut the browser down, even when scraping raises, so no
        # Chrome process is left behind between invocations.
        driver.quit()
def insertData(img_url_list, member, year, month):
    """Store one item per image URL in the DynamoDB table.

    Each item uses the URL for both ``first`` and ``second``, and records
    ``member`` plus ``month`` (the concatenation ``year + month``).
    ``None`` entries in ``img_url_list`` are skipped.
    """
    with get_table().batch_writer(overwrite_by_pkeys=['first', 'second']) as writer:
        for url in img_url_list:
            if url is None:
                continue
            record = {
                'first': url,
                'second': url,
                'member': member,
                'month': year+month,
            }
            writer.put_item(Item=record)
def get_table():
    """Return the boto3 DynamoDB ``Table`` object for the ``hinata`` table."""
    return boto3.resource('dynamodb').Table('hinata')
ex) https://hitazanaka.site/20/2020/06
import boto3
from boto3.dynamodb.conditions import Key, Attr
import os
import json
def lambda_handler(event, context):
    """Return the image URLs for one member and month as a JSON response.

    Reads ``member``, ``year`` and ``month`` from ``event['pathParameters']``
    and answers with status 200, CORS headers and a JSON-encoded body.
    ``context`` is not used.
    """
    image_urls = queryByMonthAndMember(
        event['pathParameters']['member'],
        event['pathParameters']['year'],
        event['pathParameters']['month'],
    )
    cors_headers = {
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
    }
    return {
        'statusCode': 200,
        'headers': cors_headers,
        'body': json.dumps(image_urls),
    }
def queryByMonthAndMember(member, year, month):
    """Return the ``first`` attribute of every item for a member and month.

    Queries the ``member-month-index`` index with ``member`` and the
    concatenated ``year + month`` as the key condition.

    Args:
        member: Member code (string).
        year: Year as a string, e.g. ``"2020"``.
        month: Zero-padded month as a string, e.g. ``"06"``.

    Returns:
        A list with the ``first`` value of each matching item, gathered
        across all result pages.
    """
    query_kwargs = {
        'IndexName': "member-month-index",
        'KeyConditionExpression': Key('member').eq(member) & Key('month').eq(year + month),
    }
    table = get_table()
    image_urls = []
    while True:
        response = table.query(**query_kwargs)
        image_urls.extend(d.get('first') for d in response['Items'])
        # A single Query call may return only part of the result set; keep
        # going from LastEvaluatedKey until DynamoDB stops returning one.
        last_key = response.get('LastEvaluatedKey')
        if not last_key:
            return image_urls
        query_kwargs['ExclusiveStartKey'] = last_key
def get_table():
    """Return the boto3 DynamoDB ``Table`` object for the ``hinata`` table."""
    return boto3.resource('dynamodb').Table('hinata')
ex) https://hitazanaka.site/members/20
import boto3
from boto3.dynamodb.conditions import Key, Attr
from itertools import groupby
import json
def lambda_handler(event, context):
    """Return the per-month image counts for one member as a JSON response.

    Reads ``member`` from ``event['pathParameters']`` and answers with
    status 200, CORS headers and a JSON-encoded body. ``context`` is not
    used.
    """
    cors_headers = {
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
    }
    monthly_counts = countByMonthAndMember(event['pathParameters']['member'])
    return {
        'statusCode': 200,
        'headers': cors_headers,
        'body': json.dumps(monthly_counts),
    }
def countByMonthAndMember(member):
    """Count a member's stored items per ``month`` value.

    Queries the ``member-month-index`` index by ``member`` only and tallies
    the items by their ``month`` attribute.

    Args:
        member: Member code (string).

    Returns:
        A dict mapping each ``month`` value to the number of items found,
        with keys in ascending order. All result pages are included.
    """
    query_kwargs = {
        'IndexName': "member-month-index",
        'KeyConditionExpression': Key('member').eq(member),
    }
    table = get_table()
    counts = {}
    # First fetch everything for the member, page by page.
    while True:
        response = table.query(**query_kwargs)
        for item in response["Items"]:
            month_key = item['month']
            counts[month_key] = counts.get(month_key, 0) + 1
        # A single Query call may return only part of the result set; keep
        # going from LastEvaluatedKey until DynamoDB stops returning one.
        last_key = response.get('LastEvaluatedKey')
        if not last_key:
            break
        query_kwargs['ExclusiveStartKey'] = last_key
    # Keep the months in ascending order in the returned mapping.
    return dict(sorted(counts.items()))
def get_table():
    """Return the boto3 DynamoDB ``Table`` object for the ``hinata`` table."""
    return boto3.resource('dynamodb').Table('hinata')
DynamoDBのドキュメントを見ていると、テーブルを1つに保つのがよい設計とされています。
設計が難しく、あまり飲み込めなかった。
いつかいいね機能をつけるかもしれないので、1テーブルで多対多に対応できるようfirstとsecondを作った。
DynamoDBで多対多のテーブル設計