{"created":"2021-03-01T06:01:15.918688+00:00","id":8954,"links":{},"metadata":{"_buckets":{"deposit":"3cbcac06-23e9-4d56-aad6-ea58a7f4b32a"},"_deposit":{"id":"8954","owners":[],"pid":{"revision_id":0,"type":"depid","value":"8954"},"status":"published"},"_oai":{"id":"oai:kitami-it.repo.nii.ac.jp:00008954","sets":["1:86"]},"author_link":["273","90493","90494","90495","90496","90497","90498","90499"],"item_1646810750418":{"attribute_name":"出版タイプ","attribute_value_mlt":[{"subitem_version_resource":"http://purl.org/coar/version/c_970fb48d4fbd8a85","subitem_version_type":"VoR"}]},"item_3_alternative_title_198":{"attribute_name":"その他のタイトル","attribute_value_mlt":[{"subitem_alternative_title":"The Optimal Algorithms for the Reinforcement Learning Problem Separated into a Learning Period and a Control Period","subitem_alternative_title_language":"en"}]},"item_3_biblio_info_186":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicIssueDates":{"bibliographicIssueDate":"1998-04-15","bibliographicIssueDateType":"Issued"},"bibliographicIssueNumber":"4","bibliographicPageEnd":"1126","bibliographicPageStart":"1116","bibliographicVolumeNumber":"39","bibliographic_titles":[{"bibliographic_title":"情報処理学会論文誌"},{"bibliographic_title":"Transactions of Information Processing Society of Japan","bibliographic_titleLang":"en"}]}]},"item_3_description_184":{"attribute_name":"抄録","attribute_value_mlt":[{"subitem_description":"本研究では,遷移確率行列が未知であるようなマルコフ決定過程によってモデル化されている,学習期間と制御期間に分割された強化学習問題における,最適アルゴリズムの提案を行っている.従来研究では,真の遷移確率行列を同定できれば制御期間の収益を最大化できるため,学習期間の目的を単に未知の遷移確率行列の推定としているが,有限の学習期間のもとでは推定誤差があるため,収益最大化の厳密な保証はない.そこで本研究では,有限の学習期間と有限の制御期間の強化学習問題において,制御期間の収益をベイズ基準のもとで最大化する基本最適アルゴリズムを提案する.しかし,基本最適アルゴリズムの計算量が指数オーダーのため,さらにその改良を行い,改良最適アルゴリズムを提案する.改良最適アルゴリズムは基本最適アルゴリズム同様に収益をベイズ基準のもとで最大化することができ,かつその計算量は多項式オーダーに軽減されている.","subitem_description_type":"Abstract"},{"subitem_description":"[ENG]\nIn this paper,new algorithms are proposed based on statistical decision theory in the field of Markov decision processes under the condition that a transition probability matrix is unknown.In previous researches on RL(reinforcement learning),learning is based on only the estimation of an unknown transition probability matrix and the maximum reward is not received in a finite period,though their purpose is to maximize a reward.In our algorithms it is possible to maximize the reward within a finite period with respect to Bayes criterion.Moreover, we propose some techniques to reduce the computational complexity of our algorithm from exponential order to polynomial order","subitem_description_type":"Abstract"}]},"item_3_publisher_212":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会"}]},"item_3_relation_208":{"attribute_name":"論文ID(NAID)","attribute_value_mlt":[{"subitem_relation_type_id":{"subitem_relation_type_id_text":"110002722119","subitem_relation_type_select":"NAID"}}]},"item_3_select_195":{"attribute_name":"著者版フラグ","attribute_value_mlt":[{"subitem_select_item":"publisher"}]},"item_3_source_id_187":{"attribute_name":"ISSN","attribute_value_mlt":[{"subitem_source_identifier":"1882-7764","subitem_source_identifier_type":"PISSN"}]},"item_3_source_id_189":{"attribute_name":"書誌レコードID","attribute_value_mlt":[{"subitem_source_identifier":"AN00116647","subitem_source_identifier_type":"NCID"}]},"item_access_right":{"attribute_name":"アクセス権","attribute_value_mlt":[{"subitem_access_right":"open access","subitem_access_right_uri":"http://purl.org/coar/access_right/c_abf2"}]},"item_creator":{"attribute_name":"著者","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"前田, 康成","creatorNameLang":"ja"}],"nameIdentifiers":[{},{}]},{"creatorNames":[{"creatorName":"浮田, 善文","creatorNameLang":"ja"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"松嶋, 敏泰","creatorNameLang":"ja"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"平澤, 茂一","creatorNameLang":"ja"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"MAEDA, Yasunari","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"UKITA, Yoshihumi","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"MATSUSHIMA, Toshiyasu","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"HIRASAWA, Shigeichi","creatorNameLang":"en"}],"nameIdentifiers":[{}]}]},"item_files":{"attribute_name":"ファイル情報","attribute_type":"file","attribute_value_mlt":[{"accessrole":"open_date","date":[{"dateType":"Available","dateValue":"2021-01-20"}],"displaytype":"detail","filename":"情報処理学会論文誌, 39(4), pp.1116-1126.pdf","filesize":[{"value":"1.3 MB"}],"format":"application/pdf","licensetype":"license_note","mimetype":"application/pdf","url":{"label":"情報処理学会論文誌, 39(4), pp.1116-1126","url":"https://kitami-it.repo.nii.ac.jp/record/8954/files/情報処理学会論文誌, 39(4), pp.1116-1126.pdf"},"version_id":"3a2e7452-3188-4193-af96-5dd34e88cf82"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"jpn"}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourcetype":"journal article","resourceuri":"http://purl.org/coar/resource_type/c_6501"}]},"item_title":"学習期間と制御期間に分割された強化学習問題における最適アルゴリズムの提案","item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"学習期間と制御期間に分割された強化学習問題における最適アルゴリズムの提案","subitem_title_language":"ja"},{"subitem_title":"The Optimal Algorithms for the Reinforcement Learning Problem Separated into a Learning Period and a Control Period","subitem_title_language":"en"}]},"item_type_id":"3","owner":"1","path":["86"],"pubdate":{"attribute_name":"PubDate","attribute_value":"2021-01-20"},"publish_date":"2021-01-20","publish_status":"0","recid":"8954","relation_version_is_last":true,"title":["学習期間と制御期間に分割された強化学習問題における最適アルゴリズムの提案"],"weko_creator_id":"1","weko_shared_id":-1},"updated":"2022-12-13T02:22:36.082979+00:00"}