taketea2018 が 2026年04月03日19時39分50秒に編集

初版

タイトルの変更

データサイエンス入門　第10回　その３　LSTMで小説執筆に挑戦しましょう

タグの変更

データサイエンス

Python

LSTM

小説執筆

記事種類の変更

セットアップや使用方法

本文の変更

# データサイエンス入門　 AIプログラミングで学ぶデータサイエンス ## 第10回　LSTMで小説を執筆しよう　その３　LSTMで小説執筆に挑戦しましょう ## 〇LSTMによる小説執筆について LSTMにより文章を学習させることはできたでしょうか。今回は学習結果を使って小説の執筆に挑戦しましょう。本連載は人工知能AIを扱うのに適しているPythonを言語として使用し、人工知能AIライブラリであるKerasを利用します。使用するPythonのバージョンは 3.10.12です。またKerasのバージョンは 3.5.0です。前回は、元になる文章sakuhin_all.txtを読み込み、LSTMにより学習しました。読み込んだ文章は4文字ごとに切り出した文字列とその次に来る文字を一組とします。そして、これらの文字列を配列に加工します。加工した配列をKerasに用意されているLSTMを利用して、学習します。学習結果はsakuhin_all.hdf.kerasに保存してあります。そして学習結果を利用して、小説執筆に挑戦します。説明には、元文書の文字数が少ない版であるsakuhin_mini.txtとその学習結果のsakuhin_mini.hdf.kerasを使用しています。元文書から適当に取り出した初期の文字列より、LSTMモデルが「次に来そうな文字」を予測してランダム要素を加えて1文字を選んで、文章に加えます。これを繰り返すことで小説を執筆します。 ## 〇紹介動画はYouTubeのURLよりご視聴ください。 https://youtu.be/YHBkCNo8tAw ## 〇スライド形式pdf解説書です。 https://drive.google.com/file/d/1G9WPXXKR4-km9rFikkROyJnotYyIrH4X/view?usp=drive_link ## 〇学習元の文章です。 sakuhin_all.txt https://drive.google.com/file/d/1KgJG9D1SMNSTzg49vcv7KlWgCuK4KG9I/view?usp=drive_link ## 〇サンプルプログラム ``` !pip install mecab-python3 !pip install unidic !python -m unidic download !apt-get -q -y install mecab libmecab-dev file !git clone --depth 1 https://github.com/neologd/mecab-unidic-neologd.git !echo yes | mecab-unidic-neologd/bin/install-mecab-unidic-neologd -n ``` ``` from tensorflow import keras #kerasのインポート from tensorflow.keras import layers #kerasでlayerを作るためのインポート import numpy as np #numpyのインポート import random #乱数を扱うPython標準ライブラリ import sys #システムを扱うPython標準ライブラリ import io #ファイル入出力を扱う標準ライブラリ from google.colab import drive #Googleドライブのマウント drive.mount('/content/drive') neta_path="/content/drive/MyDrive/data_science/sakuhin_all.txt" #学習元文章のファイルパス hdf_path="/content/drive/MyDrive/data_science/sakuhin_all.hdf.keras" #学習結果保存パス #neta_path="/content/drive/MyDrive/data_science/sakuhin_mini.txt" #学習元文章のファイルパス #hdf_path="/content/drive/MyDrive/data_science/sakuhin_mini.hdf.keras" #学習結果保存パス with io.open(neta_path, encoding='utf-8') as f: #学習元文章の読み込み text = f.read().lower() print('文章の長さ:', len(text)) chars = sorted(list(set(text))) # textの重複削除と並び換え print('総文字数:', 
len(chars)) char_indices = dict((c, i) for i, c in enumerate(chars)) #文字に対応する数字の辞書作成 indices_char = dict((i, c) for i, c in enumerate(chars)) #逆引き辞書作成 print('辞書:',char_indices) print('逆引辞書:',indices_char) maxlen = 4 #LSTMに入力する文字列の長さ yuragi=0.4 #生成文字ランダム度合 sentences = [] #maxlen長の文字を格納する作業用変数 # 学習済み結果のロード model_LSTM=keras.models.load_model(hdf_path) print('\n ＊＊＊　LSTMラーニング結果をロードしました : ',hdf_path) #確率分布に基づいて次の言葉をランダムに決定する def index_make(preds, temperature): print("i_preds hikisu,tempreture:",preds,temperature) preds = np.asarray(preds).astype('float64') #配列をnumpyに変換し、float64型にする print("i_preds float64:",preds) preds = np.log(preds) / temperature #確率の調整 print("i_preds log:",preds) exp_preds = np.exp(preds) #スケールを戻す print("i_exp_preds:",exp_preds) preds = exp_preds / np.sum(exp_preds) print("i_exp_preds2:",preds) probas = np.random.multinomial(1, preds, 1) #文字をランダムに選ぶ print("i_probas:",probas) return np.argmax(probas) def aip_sippitu(): #小説執筆 start_index = random.randint(0, len(text) - maxlen - 1) #書き始め文字の準備 generated = '' #執筆小説格納配列 print('\n＊＊＊　生成条件　＊＊＊') print("長さ:",maxlen) print('ゆらぎ:', yuragi) print("start_index:",start_index) sentence = text[start_index: start_index + maxlen] #書き始め文のセット print("start sentence:",sentence) generated += sentence #生成小説を格納する変数 print("start generated:",generated) print('キーワード:',generated) print('\n＊＊＊　あいぴが執筆します　＊＊＊') for i in range(100): #100文字生成 print('sentence:',sentence) x_pred = np.zeros((1, maxlen, len(chars))) #LSTM入力用配列作成 print("init_xpred:",x_pred) for t, char in enumerate(sentence): #LSTM入力文字をワンホットエンコーディングに変換 x_pred[0, t, char_indices[char]] = 1. 
#x_predがLSTMに渡す配列 print("for x_pred:",t,char,x_pred) preds = model_LSTM.predict(x_pred, verbose=0)[0] #LSTMで次の文字の確率を予測 print("LSTM preds:",preds) next_index = index_make(preds, yuragi) #index_makeで次の文字を選択yuragiでランダムさを調整 print("next_index:",next_index) next_char = indices_char[next_index] #辞書で実際の文字に変換 print("next_char:",next_char) generated += next_char #小説に文字を追加 sentence = sentence[1:] + next_char #一文字削って新しい生成文字を追加→次の予測準備 print("sentence:",sentence) print("generated:",generated) count=1 #生成小説の整形出力 for moji in generated: if moji=='「': #「の処理 print('\n') #"「"がくればまず改行する print (moji,end='') #"「"を出力し、改行しない count=1 elif moji=='」': #」の処理 print('」') count=1 elif moji=='\n': #改行の処理 moji=' ' else: print(moji,end='') if count % 25== 0: #25文字ごとに改行 print('\n',end='') count=1 count +=1 #main aip_sippitu() print('\n') ``` Google Colaboratoryにアップロードすればすぐに動作を確認できます。実行結果のサンプル付きです。 https://drive.google.com/file/d/1UlTK4vGK_C-njH2U8v7ZvtyPo6Wp54ju/view?usp=drive_link ## 〇補足 公開している動画と解説用pdfは、電波新聞社刊行の電子工作マガジンに連載された同題名の内容をGoogle NotebookLMにてまとめたものです。