LambdaとDynamoDBでオタ活

2021-01-02

作ったもの

概要

日向坂の公式ブログからスクレイピングした画像のパスを描画するだけのもの

構成

  • 関数単位のサーバレス構成(大袈裟にいうと)
  • Lambdaが定期的に動いてDynamoDBに画像のパスを入れる
  • API Gateway経由でlambdaを動かしDynamoDBからデータを取得する
  • Vue Cliで作成したプロジェクトをnetlifyにホスティング
  • すべて無料枠の範疇なのでお金取られない😄

所感

  • 無料はありがたい
  • Lambda、モジュールを動かすのがだるい
  • Lambdaってバージョン管理どうしてんだろ、、、
  • 使ってる技術は初めてのものばかりだが、無料で動かしてみたいという動機だけなので、全然理解はできてない。
  • なかでもDynamoのテーブル設計がむずくて沼った。結局勉強をやめて、とりあえず動けの方向性にシフトした

Lambdaのソースコード

Pythonでやってみた

今までのブログの画像をdynamoにいれる(ローカルで実行)

#全ての画像をDynamoDBに入れる

from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import boto3
from boto3.dynamodb.conditions import Key, Attr
import os

def main():
    """Backfill the DynamoDB table with blog image URLs for every member and month.

    Sets the AWS credentials, then walks every (member, year, month)
    combination, scraping the image URLs and writing them to the table.
    """
    set_env()

    # Member ids are "2" followed by "4" through "24"; "3" is not in the list.
    member_ids = ["2"] + [str(n) for n in range(4, 25)]
    year_labels = [str(y) for y in range(2016, 2021)]
    # Months are zero-padded to two digits ("01" .. "12").
    month_labels = [f"{m:02}" for m in range(1, 13)]

    for member_id in member_ids:
        for year_label in year_labels:
            for month_label in month_labels:
                urls = getPhotosPath(member_id, year_label, month_label)
                insertData(urls, member_id, year_label, month_label)

def getPhotosPath(member,year,month):
    """Return the image URLs found on one member's blog listing for one month.

    Args:
        member: Member id used as the ``ct`` query parameter (string).
        year: Four-digit year string.
        month: Two-digit, zero-padded month string.

    Returns:
        A list of non-empty ``src`` values of the ``<img>`` tags inside the
        first ``.l-maincontents--blog`` element. The list is empty when the
        page has no such element.
    """
    url = "https://www.hinatazaka46.com/s/official/diary/member/list?ima=0000&ct=" + member +"&dy=" + year + month

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # NOTE: the HTML that gets parsed comes from requests, not from the browser.
        html = requests.get(url)
        bs = BeautifulSoup(html.text, "html.parser")
        maincontents = bs.select('.l-maincontents--blog')
        if not maincontents:
            # No blog container on this page; nothing to collect.
            return []

        sources = (image.get('src') for image in maincontents[0].find_all("img"))
        # Drop <img> tags whose src is missing (None) or empty.
        return [src for src in sources if src]
    finally:
        # Always shut the browser down, even when fetching or parsing fails.
        driver.quit()

def insertData(img_url_list,member,year,month):
    """Write one item per image URL into the DynamoDB table.

    Args:
        img_url_list: Image URLs; ``None`` entries are skipped.
        member: Member id stored in the ``member`` attribute.
        year: Year string; concatenated with ``month`` for the ``month`` attribute.
        month: Month string.
    """
    table = get_table()
    # overwrite_by_pkeys: when key values are duplicated, the later item overwrites the earlier one.
    with table.batch_writer(overwrite_by_pkeys=['first', 'second']) as writer:
        for url in img_url_list:
            # None values also showed up in the list, so skip them.
            if url is None:
                continue
            item = {
                'first': url,
                'second': url,
                'member': member,
                'month': year+month
            }
            writer.put_item(Item=item)

def set_env():
    """Put the AWS credentials into the process environment.

    The values in the source are masked placeholders.
    """
    os.environ.update({
        'AWS_ACCESS_KEY_ID': '******',
        'AWS_SECRET_ACCESS_KEY': '*****',
    })

def get_table():
    """Return the boto3 Table resource for the 'hinata' DynamoDB table."""
    return boto3.resource('dynamodb').Table('hinata')

# Run the backfill only when this file is executed directly as a script.
if __name__ == '__main__':
    main()

画像パスをdynamoにいれる定期実行処理

今んところ1日3回実行してます

from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import boto3
from boto3.dynamodb.conditions import Key, Attr
import os
import datetime
import pytz

def lambda_handler(event, context):
    """Scrape the current month's blog images for every member and store them.

    Args:
        event: Lambda event payload (not used).
        context: Lambda context object (not used).
    """
    member_ids = ["2"] + [str(n) for n in range(4, 25)]

    # The current year/month is taken in Japan time.
    today = datetime.datetime.now(pytz.timezone('Asia/Tokyo'))
    year_label = str(today.year)
    # Zero-pad the month to two digits.
    month_label = f'{today.month:02}'

    for member_id in member_ids:
        urls = getPhotosPath(member_id, year_label, month_label)
        insertData(urls, member_id, year_label, month_label)

def getPhotosPath(member,year,month):
    """Return the image URLs found on one member's blog listing for one month.

    Starts headless Chromium from the binaries under ``/opt/headless``.

    Args:
        member: Member id used as the ``ct`` query parameter (string).
        year: Four-digit year string.
        month: Two-digit, zero-padded month string.

    Returns:
        A list of non-empty ``src`` values of the ``<img>`` tags inside the
        first ``.l-maincontents--blog`` element. The list is empty when the
        page has no such element.
    """
    options = webdriver.ChromeOptions()
    options.binary_location = "/opt/headless/python/bin/headless-chromium"
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--single-process")

    url = "https://www.hinatazaka46.com/s/official/diary/member/list?ima=0000&ct=" + member +"&dy=" + year + month

    driver = webdriver.Chrome(
        executable_path="/opt/headless/python/bin/chromedriver",
        options=options
    )
    try:
        driver.get(url)

        # NOTE: the HTML that gets parsed comes from requests, not from the browser.
        html = requests.get(url)
        bs = BeautifulSoup(html.text, "html.parser")
        maincontents = bs.select('.l-maincontents--blog')
        if not maincontents:
            # No blog container on this page; nothing to collect.
            return []

        sources = (image.get('src') for image in maincontents[0].find_all("img"))
        # Drop <img> tags whose src is missing (None) or empty.
        return [src for src in sources if src]
    finally:
        # Always shut the browser down, even when fetching or parsing fails.
        driver.quit()

def insertData(img_url_list,member,year,month):
    """Write one item per image URL into the DynamoDB table.

    Args:
        img_url_list: Image URLs; ``None`` entries are skipped.
        member: Member id stored in the ``member`` attribute.
        year: Year string; concatenated with ``month`` for the ``month`` attribute.
        month: Month string.
    """
    table = get_table()
    with table.batch_writer(overwrite_by_pkeys=['first', 'second']) as writer:
        for url in img_url_list:
            if url is None:
                continue
            item = {
                'first': url,
                'second': url,
                'member': member,
                'month': year+month
            }
            writer.put_item(Item=item)

def get_table():
    """Return the boto3 Table resource for the 'hinata' DynamoDB table."""
    resource = boto3.resource('dynamodb')
    return resource.Table('hinata')

月別の画像パスを取得

ex) https://hitazanaka.site/20/2020/06

import boto3
from boto3.dynamodb.conditions import Key, Attr
import os
import json

def lambda_handler(event, context):
    """Return the image URLs for the member/year/month given in the path.

    Args:
        event: Event dict; ``pathParameters`` must contain ``member``,
            ``year`` and ``month``.
        context: Lambda context object (not used).

    Returns:
        A response dict with status 200, CORS headers, and a JSON-encoded
        list of image URLs as the body.
    """
    params = event['pathParameters']
    image_urls = queryByMonthAndMember(params['member'], params['year'], params['month'])

    cors_headers = {
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET'
    }
    return {
        'statusCode': 200,
        'headers': cors_headers,
        'body': json.dumps(image_urls)
    }

def queryByMonthAndMember(member, year, month):
    """Return the ``first`` attribute of every item for one member and month.

    Queries the ``member-month-index`` index and follows
    ``LastEvaluatedKey`` so that results spanning several pages are not
    truncated.

    Args:
        member: Member id to match against the ``member`` key.
        year: Year string; concatenated with ``month`` to match the ``month`` key.
        month: Month string.

    Returns:
        A list with the ``first`` value of each matching item.
    """
    table = get_table()
    query_kwargs = {
        'IndexName': "member-month-index",
        'KeyConditionExpression': Key('member').eq(member) & Key('month').eq(year + month),
    }

    results = []
    while True:
        response = table.query(**query_kwargs)
        results.extend(d.get('first') for d in response['Items'])

        # A missing LastEvaluatedKey means this was the final page.
        last_key = response.get('LastEvaluatedKey')
        if last_key is None:
            break
        query_kwargs['ExclusiveStartKey'] = last_key

    return results

def get_table():
    """Return the boto3 Table resource for the 'hinata' DynamoDB table."""
    return boto3.resource('dynamodb').Table('hinata')

月別の画像数を表示

ex) https://hitazanaka.site/members/20

import boto3
from boto3.dynamodb.conditions import Key, Attr
from itertools import groupby
import json

def lambda_handler(event, context):
    """Return per-month image counts for the member given in the path.

    Args:
        event: Event dict; ``pathParameters`` must contain ``member``.
        context: Lambda context object (not used).

    Returns:
        A response dict with status 200, CORS headers, and a JSON-encoded
        mapping of month to image count as the body.
    """
    cors_headers = {
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET'
    }
    member_id = event['pathParameters']['member']
    counts = countByMonthAndMember(member_id)
    return {
        'statusCode': 200,
        'headers': cors_headers,
        'body': json.dumps(counts)
    }

def countByMonthAndMember(member):
    """Count one member's stored items per ``month`` value.

    Queries the ``member-month-index`` index for the member and follows
    ``LastEvaluatedKey`` so that items beyond the first result page are
    counted too.

    Args:
        member: Member id to match against the ``member`` key.

    Returns:
        A dict mapping each ``month`` value to the number of items with that
        value, with keys inserted in ascending ``month`` order.
    """
    table = get_table()
    query_kwargs = {
        'IndexName': "member-month-index",
        'KeyConditionExpression': Key('member').eq(member),
    }

    # First fetch everything for the member, page by page, tallying per month.
    counts = {}
    while True:
        response = table.query(**query_kwargs)
        for item in response["Items"]:
            month = item['month']
            counts[month] = counts.get(month, 0) + 1

        # A missing LastEvaluatedKey means this was the final page.
        last_key = response.get('LastEvaluatedKey')
        if last_key is None:
            break
        query_kwargs['ExclusiveStartKey'] = last_key

    # Keep the months in ascending order in the returned dict.
    return {month: counts[month] for month in sorted(counts)}

def get_table():
    """Return the boto3 Table resource for the 'hinata' DynamoDB table."""
    resource = boto3.resource('dynamodb')
    return resource.Table('hinata')

DynamoDB

DynamoDBのドキュメントを見てると、テーブルを1つに保つのがいい設計とされてます。
設計が難しく、あまり飲み込めなかった。

DynamoDB 用の NoSQL 設計

項目の中身

いつかいいね機能をつけるかもしれないので、1テーブルで多対多に対応できるようfirstとsecondを作った。
DynamoDBで多対多のテーブル設計

インデックス

  • メンバーと年月で検索するように設定