<div class="xblock xblock-public_view xblock-public_view-vertical" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@reinforcement_learning_notes" data-init="VerticalStudentView" data-block-type="vertical" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<h2 class="hd hd-2 unit-title">Notes – Chapter 11: Reinforcement learning</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@html+block@reinforcement_learning_notes_top">
<div class="xblock xblock-public_view xblock-public_view-html xmodule_display xmodule_HtmlBlock" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@html+block@reinforcement_learning_notes_top" data-init="XBlockToXModuleShim" data-block-type="html" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "HTMLModule"}
</script>
<p>
You can sequence through the Reinforcement learning lecture video and note segments (go to Next page). </p><p>
Alternatively, you can download the <a href="/assets/courseware/v1/153f87d9a11295896ffa8215253bf354/asset-v1:MITx+6.036+1T2019+type@asset+block/notes_chapter_Reinforcement_learning.pdf" target="_blank">Chapter 11: Reinforcement learning</a> notes as a PDF file. </p>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L09a_vert" data-init="VerticalStudentView" data-block-type="vertical" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<h2 class="hd hd-2 unit-title">Lecture: Introduction to reinforcement learning</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09a">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09a" data-init="XBlockToXModuleShim" data-block-type="video" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: Introduction to reinforcement learning</h3>
<div
id="video_MIT6036L09a"
class="video closed"
data-metadata='{"autoAdvance": false, "prioritizeHls": false, "recordedYoutubeIsAvailable": true, "ytTestTimeout": 1500, "poster": null, "streams": "1.00:RD8c9atplDU", "saveStateEnabled": false, "end": 0.0, "speed": null, "completionPercentage": 0.95, "start": 0.0, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09a/handler/publish_completion", "duration": 0.0, "autoplay": false, "savedVideoPosition": 0.0, "generalSpeed": 1.0, "autohideHtml5": false, "ytMetadataEndpoint": "", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09a/handler/transcript/translation/__lang__", "showCaptions": "true", "completionEnabled": false, "captionDataDir": null, "ytApiUrl": "https://www.youtube.com/iframe_api", "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09a/handler/xmodule_handler/save_user_state", "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09a/handler/transcript/available_translations", "sources": [], "transcriptLanguages": {"en": "English"}, "transcriptLanguage": "en", "lmsRootURL": "https://openlearninglibrary.mit.edu"}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L09a"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@reinforcement_learning_top_vert" data-init="VerticalStudentView" data-block-type="vertical" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<h2 class="hd hd-2 unit-title">Introduction to reinforcement learning</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@html+block@reinforcement_learning_top">
<div class="xblock xblock-public_view xblock-public_view-html xmodule_display xmodule_HtmlBlock" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@html+block@reinforcement_learning_top" data-init="XBlockToXModuleShim" data-block-type="html" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "HTMLModule"}
</script>
<p>
So far, all the learning problems we have looked at have been <em>supervised</em>: that is, for each training input [mathjaxinline]x^{(i)}[/mathjaxinline], we are told which value [mathjaxinline]y^{(i)}[/mathjaxinline] should be the output. A very different problem setting is <em>reinforcement learning</em>, in which the learning system is not directly told which outputs go with which inputs. Instead, there is an interaction of the form: </p><ul class="itemize"><li><p>
Learner observes <em>input</em> [mathjaxinline]s^{(i)}[/mathjaxinline] </p></li><li><p>
Learner generates <em>output</em> [mathjaxinline]a^{(i)}[/mathjaxinline] </p></li><li><p>
Learner observes <em>reward</em> [mathjaxinline]r^{(i)}[/mathjaxinline] </p></li><li><p>
Learner observes <em>input</em> [mathjaxinline]s^{(i+1)}[/mathjaxinline] </p></li><li><p>
Learner generates <em>output</em> [mathjaxinline]a^{(i+1)}[/mathjaxinline] </p></li><li><p>
Learner observes <em>reward</em> [mathjaxinline]r^{(i+1)}[/mathjaxinline] </p></li><li><p>
[mathjaxinline]\ldots[/mathjaxinline] </p></li></ul><p>
The learner is supposed to find a <em>policy</em>, mapping [mathjaxinline]s[/mathjaxinline] to [mathjaxinline]a[/mathjaxinline], that maximizes expected reward over time. </p><center><p><img src="/assets/courseware/v1/968946282a186c6c80c9b231f51e2004/asset-v1:MITx+6.036+1T2019+type@asset+block/images_reinforcement_learning_top_tikzpicture_1-crop.png" width="408"/></p></center><p>
This problem setting is equivalent to an <em>online</em> supervised-learning problem under the following assumptions: </p><ol class="enumerate"><li value="1"><p>
The space of possible outputs is binary (e.g. [mathjaxinline]\{ +1, -1\}[/mathjaxinline]) and the space of possible rewards is binary (e.g. [mathjaxinline]\{ +1, -1\}[/mathjaxinline]); </p></li><li value="2"><p>
[mathjaxinline]s^{(i)}[/mathjaxinline] is independent of all previous [mathjaxinline]s^{(j)}[/mathjaxinline] and [mathjaxinline]a^{(j)}[/mathjaxinline]; and </p></li><li value="3"><p>
[mathjaxinline]r^{(i)}[/mathjaxinline] depends only on [mathjaxinline]s^{(i)}[/mathjaxinline] and [mathjaxinline]a^{(i)}[/mathjaxinline]. </p></li></ol><p>
In this case, for any experience tuple [mathjaxinline](s^{(i)}, a^{(i)}, r^{(i)})[/mathjaxinline], we can generate a supervised training example, which is equal to [mathjaxinline](s^{(i)}, a^{(i)})[/mathjaxinline] if [mathjaxinline]r^{(i)} = +1[/mathjaxinline] and [mathjaxinline](s^{(i)}, -a^{(i)})[/mathjaxinline] otherwise. <br/> <br/><span style="color:#FF0000"><b class="bf">Study Question:</b></span> <span style="color:#0000FF">What supervised-learning loss function would this objective correspond to?</span> <br/></p><p>
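To make this concrete, here is a minimal sketch of that conversion (the function and variable names are ours, not the notes'), assuming actions and rewards both take values in [mathjaxinline]\{ +1, -1\}[/mathjaxinline]: </p>
<pre>
# Sketch: turn (state, action, reward) experience tuples into supervised
# (input, label) pairs, assuming actions and rewards are both in {+1, -1}.
def experience_to_supervised(experience):
    examples = []
    for s, a, r in experience:
        # A reward of +1 marks the chosen action as the correct label;
        # otherwise the opposite action is treated as correct.
        label = a if r == +1 else -a
        examples.append((s, label))
    return examples

# Example: one rewarded and one unrewarded interaction.
data = [("s1", +1, +1), ("s2", -1, -1)]
print(experience_to_supervised(data))  # [('s1', 1), ('s2', 1)]
</pre>
<p>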
Reinforcement learning is more interesting when these properties do not hold. When we relax assumption 1 above, we have the class of <em>bandit problems</em>, which we discuss in the "Bandit problems" section below. If we relax assumption 2, but assume that the environment the agent is interacting with is an <i class="sc">mdp</i>, so that [mathjaxinline]s^{(i)}[/mathjaxinline] depends only on [mathjaxinline]s^{(i-1)}[/mathjaxinline] and [mathjaxinline]a^{(i-1)}[/mathjaxinline], then we are in the classical <em>reinforcement-learning</em> setting, which we discuss in the "Sequential problems" section below. Weakening the assumptions further, for instance by not allowing the learner to observe the current state completely and correctly, makes the problem into a <em>partially observed MDP</em> (<i class="sc">pomdp</i>), which is substantially more difficult and beyond the scope of this class. </p>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L09b_vert" data-init="VerticalStudentView" data-block-type="vertical" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<h2 class="hd hd-2 unit-title">Lecture: K-armed bandits</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09b">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09b" data-init="XBlockToXModuleShim" data-block-type="video" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: K-armed bandits</h3>
<div
id="video_MIT6036L09b"
class="video closed"
data-metadata='{"autoAdvance": false, "prioritizeHls": false, "recordedYoutubeIsAvailable": true, "ytTestTimeout": 1500, "poster": null, "streams": "1.00:Jq15p4Jt3x8", "saveStateEnabled": false, "end": 0.0, "speed": null, "completionPercentage": 0.95, "start": 0.0, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09b/handler/publish_completion", "duration": 0.0, "autoplay": false, "savedVideoPosition": 0.0, "generalSpeed": 1.0, "autohideHtml5": false, "ytMetadataEndpoint": "", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09b/handler/transcript/translation/__lang__", "showCaptions": "true", "completionEnabled": false, "captionDataDir": null, "ytApiUrl": "https://www.youtube.com/iframe_api", "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09b/handler/xmodule_handler/save_user_state", "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09b/handler/transcript/available_translations", "sources": [], "transcriptLanguages": {"en": "English"}, "transcriptLanguage": "en", "lmsRootURL": "https://openlearninglibrary.mit.edu"}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L09b"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@reinforcement_learning_bandit_problems_vert" data-init="VerticalStudentView" data-block-type="vertical" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<h2 class="hd hd-2 unit-title">Bandit problems</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@html+block@reinforcement_learning_bandit_problems">
<div class="xblock xblock-public_view xblock-public_view-html xmodule_display xmodule_HtmlBlock" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@html+block@reinforcement_learning_bandit_problems" data-init="XBlockToXModuleShim" data-block-type="html" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "HTMLModule"}
</script>
<p>
A basic bandit problem is given by </p><ul class="itemize"><li><p>
A set of actions [mathjaxinline]\mathcal A[/mathjaxinline]; </p></li><li><p>
A set of reward values [mathjaxinline]\mathcal R[/mathjaxinline]; and </p></li><li><p>
A probabilistic reward function [mathjaxinline]R: \mathcal A \rightarrow \mathrm{Dist}(\mathcal R)[/mathjaxinline], where [mathjaxinline]R(a) = P(R \mid A = a)[/mathjaxinline] is a probability distribution over possible reward values in [mathjaxinline]\mathcal R[/mathjaxinline], conditioned on which action is selected. </p></li></ul><p>
The most typical bandit problem has [mathjaxinline]\mathcal R = \{ 0, 1\}[/mathjaxinline] and [mathjaxinline]\lvert \mathcal A \rvert = k[/mathjaxinline]. This is called a <em>[mathjaxinline]k[/mathjaxinline]-armed bandit problem</em>. (Why? In English slang, “one-armed bandit" is a name for a slot machine, an old-style gambling machine where you put a coin into a slot and then pull its arm to see if you get a payoff; it is called a bandit because it has one arm and takes your money! What we have here is a similar sort of machine, but with [mathjaxinline]k[/mathjaxinline] arms.) There is a lot of mathematical literature on optimal strategies for [mathjaxinline]k[/mathjaxinline]-armed bandit problems under various assumptions. The important question is usually one of <em>exploration versus exploitation</em>. Imagine that you have tried each action 10 times, and now you have an estimate [mathjaxinline]\hat{p}_ j[/mathjaxinline] for the expected value of [mathjaxinline]R(a_ j)[/mathjaxinline]. Which arm should you pick next? You could </p><p><img src="/assets/courseware/v1/0112ddc144efc1296691498fb23df5e3/asset-v1:MITx+6.036+1T2019+type@asset+block/images_reinforcement_learning_bandit_problems_description_1-crop.png" width="895"/></p><p>
The theory ultimately tells us that the longer our horizon [mathjaxinline]H[/mathjaxinline] (or, similarly, the closer our discount factor is to [mathjaxinline]1[/mathjaxinline]), the more time we should spend exploring, so that we don't converge prematurely on a bad choice of action. <br/> <br/><span style="color:#FF0000"><b class="bf">Study Question:</b></span> <span style="color:#0000FF">Why is it that “bad" luck during exploration is more dangerous than “good" luck? Imagine that there is an action that generates reward value 1 with probability 0.9, but the first three times you try it, it generates value 0. How might that cause difficulty? Why is this more dangerous than the situation when an action that generates reward value 1 with probability 0.1 actually generates reward 1 on the first three tries? </span> <br/> (Note: there is a setting of supervised learning, called <em>active learning</em>, where instead of being given a training set, the learner gets to select values of [mathjaxinline]x[/mathjaxinline] and the environment gives back a label [mathjaxinline]y[/mathjaxinline]; the problem of picking good [mathjaxinline]x[/mathjaxinline] values to query is interesting, but the problem of deriving a hypothesis from [mathjaxinline](x, y)[/mathjaxinline] pairs is the same as the supervised problem we have been studying.) Note that what makes this a very different kind of problem from the batch supervised learning setting is that: </p><ul class="itemize"><li><p>
The agent gets to influence what data it gets (selecting [mathjaxinline]a_ j[/mathjaxinline] gives it another sample from [mathjaxinline]r_ j[/mathjaxinline]), and </p></li><li><p>
The agent is penalized for mistakes it makes while it is learning (if it is trying to maximize the expected sum of [mathjaxinline]r_ t[/mathjaxinline] it gets while behaving). </p></li></ul><p>
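As a concrete illustration of the exploration/exploitation trade-off described above, here is a minimal sketch of an [mathjaxinline]\epsilon[/mathjaxinline]-greedy strategy for a [mathjaxinline]k[/mathjaxinline]-armed bandit with rewards in [mathjaxinline]\{ 0, 1\}[/mathjaxinline]; the arm probabilities and all names are illustrative assumptions, not part of the notes: </p>
<pre>
import random

# Sketch: epsilon-greedy play on a k-armed bandit with Bernoulli rewards.
true_p = [0.2, 0.5, 0.8]          # hypothetical expected reward of each arm
k = len(true_p)
counts = [0] * k                  # number of pulls of each arm
p_hat = [0.0] * k                 # running estimate of each arm's expected reward
epsilon = 0.1

for _ in range(10000):
    if random.random() < epsilon:
        a = random.randrange(k)                        # explore: a random arm
    else:
        a = max(range(k), key=lambda j: p_hat[j])      # exploit: current best estimate
    r = 1 if random.random() < true_p[a] else 0        # sample a reward
    counts[a] += 1
    p_hat[a] += (r - p_hat[a]) / counts[a]             # running-average update

print(p_hat)   # estimates should approach true_p for well-explored arms
</pre>
<p>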
In a <em>contextual</em> bandit problem, you have multiple possible states, drawn from some set [mathjaxinline]\mathcal S[/mathjaxinline], and a separate bandit problem associated with each one. </p><p>
Bandit problems will be an essential sub-component of reinforcement learning. </p>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L09c_vert" data-init="VerticalStudentView" data-block-type="vertical" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<h2 class="hd hd-2 unit-title">Lecture: Objectives of the reinforcement learning problem</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09c">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09c" data-init="XBlockToXModuleShim" data-block-type="video" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: Objectives of the reinforcement learning problem</h3>
<div
id="video_MIT6036L09c"
class="video closed"
data-metadata='{"autoAdvance": false, "prioritizeHls": false, "recordedYoutubeIsAvailable": true, "ytTestTimeout": 1500, "poster": null, "streams": "1.00:0DAzFI69Q44", "saveStateEnabled": false, "end": 0.0, "speed": null, "completionPercentage": 0.95, "start": 0.0, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09c/handler/publish_completion", "duration": 0.0, "autoplay": false, "savedVideoPosition": 0.0, "generalSpeed": 1.0, "autohideHtml5": false, "ytMetadataEndpoint": "", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09c/handler/transcript/translation/__lang__", "showCaptions": "true", "completionEnabled": false, "captionDataDir": null, "ytApiUrl": "https://www.youtube.com/iframe_api", "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09c/handler/xmodule_handler/save_user_state", "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09c/handler/transcript/available_translations", "sources": [], "transcriptLanguages": {"en": "English"}, "transcriptLanguage": "en", "lmsRootURL": "https://openlearninglibrary.mit.edu"}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L09c"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L09d_vert" data-init="VerticalStudentView" data-block-type="vertical" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<h2 class="hd hd-2 unit-title">Lecture: Model-based learning</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09d">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09d" data-init="XBlockToXModuleShim" data-block-type="video" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: Model-based learning</h3>
<div
id="video_MIT6036L09d"
class="video closed"
data-metadata='{"autoAdvance": false, "prioritizeHls": false, "recordedYoutubeIsAvailable": true, "ytTestTimeout": 1500, "poster": null, "streams": "1.00:gTnTyPRPUJo", "saveStateEnabled": false, "end": 0.0, "speed": null, "completionPercentage": 0.95, "start": 0.0, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09d/handler/publish_completion", "duration": 0.0, "autoplay": false, "savedVideoPosition": 0.0, "generalSpeed": 1.0, "autohideHtml5": false, "ytMetadataEndpoint": "", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09d/handler/transcript/translation/__lang__", "showCaptions": "true", "completionEnabled": false, "captionDataDir": null, "ytApiUrl": "https://www.youtube.com/iframe_api", "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09d/handler/xmodule_handler/save_user_state", "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09d/handler/transcript/available_translations", "sources": [], "transcriptLanguages": {"en": "English"}, "transcriptLanguage": "en", "lmsRootURL": "https://openlearninglibrary.mit.edu"}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L09d"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L09e_vert" data-init="VerticalStudentView" data-block-type="vertical" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<h2 class="hd hd-2 unit-title">Lecture: Policy search</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09e">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09e" data-init="XBlockToXModuleShim" data-block-type="video" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: Policy search</h3>
<div
id="video_MIT6036L09e"
class="video closed"
data-metadata='{"autoAdvance": false, "prioritizeHls": false, "recordedYoutubeIsAvailable": true, "ytTestTimeout": 1500, "poster": null, "streams": "1.00:-oGUCUbWHdQ", "saveStateEnabled": false, "end": 0.0, "speed": null, "completionPercentage": 0.95, "start": 0.0, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09e/handler/publish_completion", "duration": 0.0, "autoplay": false, "savedVideoPosition": 0.0, "generalSpeed": 1.0, "autohideHtml5": false, "ytMetadataEndpoint": "", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09e/handler/transcript/translation/__lang__", "showCaptions": "true", "completionEnabled": false, "captionDataDir": null, "ytApiUrl": "https://www.youtube.com/iframe_api", "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09e/handler/xmodule_handler/save_user_state", "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09e/handler/transcript/available_translations", "sources": [], "transcriptLanguages": {"en": "English"}, "transcriptLanguage": "en", "lmsRootURL": "https://openlearninglibrary.mit.edu"}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L09e"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L09f_vert" data-init="VerticalStudentView" data-block-type="vertical" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<h2 class="hd hd-2 unit-title">Lecture: Q-learning</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09f">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09f" data-init="XBlockToXModuleShim" data-block-type="video" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: Q-learning</h3>
<div
id="video_MIT6036L09f"
class="video closed"
data-metadata='{"autoAdvance": false, "prioritizeHls": false, "recordedYoutubeIsAvailable": true, "ytTestTimeout": 1500, "poster": null, "streams": "1.00:YMQlGyB37g8", "saveStateEnabled": false, "end": 0.0, "speed": null, "completionPercentage": 0.95, "start": 0.0, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09f/handler/publish_completion", "duration": 0.0, "autoplay": false, "savedVideoPosition": 0.0, "generalSpeed": 1.0, "autohideHtml5": false, "ytMetadataEndpoint": "", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09f/handler/transcript/translation/__lang__", "showCaptions": "true", "completionEnabled": false, "captionDataDir": null, "ytApiUrl": "https://www.youtube.com/iframe_api", "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09f/handler/xmodule_handler/save_user_state", "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09f/handler/transcript/available_translations", "sources": [], "transcriptLanguages": {"en": "English"}, "transcriptLanguage": "en", "lmsRootURL": "https://openlearninglibrary.mit.edu"}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L09f"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L09g_vert" data-init="VerticalStudentView" data-block-type="vertical" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<h2 class="hd hd-2 unit-title">Lecture: Q-learning select-action strategies</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09g">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09g" data-init="XBlockToXModuleShim" data-block-type="video" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: Q-learning select-action strategies</h3>
<div
id="video_MIT6036L09g"
class="video closed"
data-metadata='{"autoAdvance": false, "prioritizeHls": false, "recordedYoutubeIsAvailable": true, "ytTestTimeout": 1500, "poster": null, "streams": "1.00:fEcP3XtoLDc", "saveStateEnabled": false, "end": 0.0, "speed": null, "completionPercentage": 0.95, "start": 0.0, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09g/handler/publish_completion", "duration": 0.0, "autoplay": false, "savedVideoPosition": 0.0, "generalSpeed": 1.0, "autohideHtml5": false, "ytMetadataEndpoint": "", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09g/handler/transcript/translation/__lang__", "showCaptions": "true", "completionEnabled": false, "captionDataDir": null, "ytApiUrl": "https://www.youtube.com/iframe_api", "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09g/handler/xmodule_handler/save_user_state", "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09g/handler/transcript/available_translations", "sources": [], "transcriptLanguages": {"en": "English"}, "transcriptLanguage": "en", "lmsRootURL": "https://openlearninglibrary.mit.edu"}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L09g"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L09h_vert" data-init="VerticalStudentView" data-block-type="vertical" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<h2 class="hd hd-2 unit-title">Lecture: Neural networks and Q-learning</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09h">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09h" data-init="XBlockToXModuleShim" data-block-type="video" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: Neural networks and Q-learning</h3>
<div
id="video_MIT6036L09h"
class="video closed"
data-metadata='{"autoAdvance": false, "prioritizeHls": false, "recordedYoutubeIsAvailable": true, "ytTestTimeout": 1500, "poster": null, "streams": "1.00:bm6Nqrtsv8Q", "saveStateEnabled": false, "end": 0.0, "speed": null, "completionPercentage": 0.95, "start": 0.0, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09h/handler/publish_completion", "duration": 0.0, "autoplay": false, "savedVideoPosition": 0.0, "generalSpeed": 1.0, "autohideHtml5": false, "ytMetadataEndpoint": "", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09h/handler/transcript/translation/__lang__", "showCaptions": "true", "completionEnabled": false, "captionDataDir": null, "ytApiUrl": "https://www.youtube.com/iframe_api", "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09h/handler/xmodule_handler/save_user_state", "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09h/handler/transcript/available_translations", "sources": [], "transcriptLanguages": {"en": "English"}, "transcriptLanguage": "en", "lmsRootURL": "https://openlearninglibrary.mit.edu"}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L09h"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@reinforcement_learning_sequential_problems_vert" data-init="VerticalStudentView" data-block-type="vertical" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<h2 class="hd hd-2 unit-title">Sequential problems</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@html+block@reinforcement_learning_sequential_problems">
<div class="xblock xblock-public_view xblock-public_view-html xmodule_display xmodule_HtmlBlock" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@html+block@reinforcement_learning_sequential_problems" data-init="XBlockToXModuleShim" data-block-type="html" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "HTMLModule"}
</script>
<p>
In the more typical (and difficult!) case, we can think of our learning agent interacting with an <i class="sc">mdp</i>, where it knows [mathjaxinline]\mathcal S[/mathjaxinline] and [mathjaxinline]\mathcal A[/mathjaxinline], but not [mathjaxinline]T(s,a,s')[/mathjaxinline] or [mathjaxinline]R(s,a)[/mathjaxinline]. The learner can interact with the environment by selecting actions. So, this is somewhat like a contextual bandit problem, but more complicated, because selecting an action influences not only what the immediate reward will be, but also what state the system ends up in at the next time step and, therefore, what additional rewards might be available in the future. </p><p>
A <em>reinforcement-learning (<i class="sc">rl</i>) algorithm</em> is a kind of policy that depends on the whole history of states, actions, and rewards, and selects the next action to take. There are several different ways to measure the quality of an <i class="sc">rl</i> algorithm, including: </p><ul class="itemize"><li><p>
Ignoring the [mathjaxinline]r_ t[/mathjaxinline] values that it gets <em>while</em> learning, but considering how many interactions with the environment are required for it to learn a policy [mathjaxinline]\pi : \mathcal{S} \rightarrow \mathcal{A}[/mathjaxinline] that is nearly optimal. </p></li><li><p>
Maximizing the expected discounted sum of rewards that it receives while it is learning. </p></li></ul><p>
Most of the focus is on the first criterion, because the second one is very difficult. The first criterion is reasonable when the learning can take place somewhere safe (imagine a robot learning, inside the robot factory, where it can't hurt itself too badly) or in a simulated environment. </p><p>
Approaches to reinforcement learning differ significantly according to what kind of hypothesis or model they learn. In the following sections, we will consider several different approaches. </p><p><h3>Model-based RL</h3> The conceptually simplest approach to <i class="sc">rl</i> is to estimate [mathjaxinline]R[/mathjaxinline] and [mathjaxinline]T[/mathjaxinline] from the data we have gotten so far, and then use those estimates, together with an algorithm for solving <i class="sc">mdp</i>s (such as value iteration), to find a policy that is near-optimal given the current model estimates. </p><p>
Assume that we have had some set of interactions with the environment, which can be characterized as a set of tuples of the form [mathjaxinline](s^{(t)}, a^{(t)}, r^{(t)}, s^{(t+1)})[/mathjaxinline]. </p><p>
We can estimate [mathjaxinline]T(s,a,s')[/mathjaxinline] using a simple counting strategy, </p><table id="a0000000002" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]\hat{T}(s,a,s') = \frac{\# (s,a,s') + 1}{\# (s,a) + \left| \mathcal{S}\right|}.[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
Here, [mathjaxinline]\# (s, a, s')[/mathjaxinline] represents the number of times in our data set we have the situation where [mathjaxinline]s_ t = s, a_ t = a, s_{t+1} = s'[/mathjaxinline] and [mathjaxinline]\# (s, a)[/mathjaxinline] represents the number of times in our data set we have the situation where [mathjaxinline]s_ t = s, a_ t = a[/mathjaxinline]. <br/> <br/><span style="color:#FF0000"><b class="bf">Study Question:</b></span> <span style="color:#0000FF">Prove to yourself that [mathjaxinline]\# (s,a) = \sum _{s'} \# (s,a,s')[/mathjaxinline].</span> <br/></p><p>
Adding 1 and [mathjaxinline]\left|\mathcal{S}\right|[/mathjaxinline] to the numerator and denominator, respectively, is a form of smoothing called the <em>Laplace correction</em>. It ensures that we never estimate that a probability is 0, and keeps us from dividing by 0. As the amount of data we gather increases, the influence of this correction fades away. </p><p>
We also estimate the reward function [mathjaxinline]R(s,a)[/mathjaxinline]: </p><table id="a0000000003" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]\hat{R}(s,a) = \frac{\sum r \mid s, a}{\# (s,a)}[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
where </p><table id="a0000000004" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]\sum r \mid s, a = \sum _{\{ t \mid s_ t = s, a_ t = a\} } r^{(t)}\; \; .[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
This is just the average of the observed rewards for each [mathjaxinline]s, a[/mathjaxinline] pair. </p><p>
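A minimal dictionary-based sketch of these counting estimates (names are ours; the Laplace correction follows the formula above, and returning 0 for a never-tried state-action pair in the reward estimate is our own convention): </p>
<pre>
from collections import defaultdict

# Sketch: estimate T_hat and R_hat from experience tuples (s, a, r, s_next).
def estimate_model(experience, states):
    trans_count = defaultdict(int)    # counts of (s, a, s')
    sa_count = defaultdict(int)       # counts of (s, a)
    reward_sum = defaultdict(float)   # sum of rewards observed for (s, a)
    for s, a, r, s_next in experience:
        trans_count[(s, a, s_next)] += 1
        sa_count[(s, a)] += 1
        reward_sum[(s, a)] += r

    def T_hat(s, a, s_next):
        # Laplace-corrected transition estimate
        return (trans_count[(s, a, s_next)] + 1) / (sa_count[(s, a)] + len(states))

    def R_hat(s, a):
        # average observed reward; 0 if (s, a) has never been tried
        n = sa_count[(s, a)]
        return reward_sum[(s, a)] / n if n > 0 else 0.0

    return T_hat, R_hat
</pre>
<p>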
We can now solve the <i class="sc">mdp</i> [mathjaxinline](\mathcal S, \mathcal A, \hat{T}, \hat{R})[/mathjaxinline] to find an optimal policy using value iteration, or use a finite-depth expecti-max search to find an action to take for a particular state. </p><p>
This technique is effective for problems with small state and action spaces, where it is not too hard to get enough experience to estimate [mathjaxinline]T[/mathjaxinline] and [mathjaxinline]R[/mathjaxinline] well; but it is difficult to generalize this method to handle continuous (or very large discrete) state spaces, and doing so is a topic of current research. </p><p><h3>Policy search</h3> A very different strategy is to search directly for a good policy, without first (or ever!) estimating the transition and reward models. The strategy here is to define a functional form [mathjaxinline]f(s;\theta ) = a[/mathjaxinline] for the policy, where [mathjaxinline]\theta[/mathjaxinline] represents the parameters we learn from experience. We choose [mathjaxinline]f[/mathjaxinline] to be differentiable, and often let [mathjaxinline]f(s;\theta ) = P(a)[/mathjaxinline], a probability distribution over our possible actions. </p><p>
Now, we can train the policy parameters using gradient descent: </p><ul class="itemize"><li><p>
When [mathjaxinline]\theta[/mathjaxinline] has relatively low dimension, we can compute a numeric estimate of the gradient by running the policy multiple times for [mathjaxinline]\theta \pm \epsilon[/mathjaxinline], and computing the resulting rewards (see the sketch after this list). </p></li><li><p>
When [mathjaxinline]\theta[/mathjaxinline] has higher dimensions (e.g., it is a complicated neural network), there are more clever algorithms, e.g., one called <i class="sc">reinforce</i>, but they can often be difficult to get to work reliably. </p></li></ul><p>
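A minimal sketch of the low-dimensional, numeric-gradient approach from the first bullet above (run_policy, the perturbation size, and the rollout count are hypothetical placeholders): </p>
<pre>
import numpy as np

# Sketch: finite-difference estimate of the gradient of expected reward
# with respect to a low-dimensional, 1-D float array of policy parameters theta.
# run_policy(theta) is assumed to return the total reward of one episode
# executed with policy parameters theta.
def numeric_policy_gradient(run_policy, theta, eps=0.01, n_rollouts=20):
    def avg_reward(th):
        return sum(run_policy(th) for _ in range(n_rollouts)) / n_rollouts

    grad = np.zeros_like(theta)
    for i in range(len(theta)):
        delta = np.zeros_like(theta)
        delta[i] = eps
        grad[i] = (avg_reward(theta + delta) - avg_reward(theta - delta)) / (2 * eps)
    return grad

# One (hypothetical) ascent step on expected reward, i.e., descent on its negative:
# theta = theta + step_size * numeric_policy_gradient(run_policy, theta)
</pre>
<p>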
Policy search is a good choice when the policy has a simple known form, but the model would be much more complicated to estimate. </p><p><h3>Value function learning</h3> The most popular class of algorithms learns neither explicit transition and reward models nor a direct policy, but instead concentrates on learning a value function. It is a topic of current research to describe exactly under what circumstances value-function-based approaches are best, and there are a growing number of methods that combine value functions, transition and reward models, and policies into a complex learning algorithm in an attempt to combine the strengths of each approach. </p><p>
We will study two variations on value-function learning, both of which estimate the [mathjaxinline]Q[/mathjaxinline] function. </p><p><h4>Q-learning</h4></p><p>
This is the most typical way of performing reinforcement learning. (Note: the thing that most students seem to get confused about is when we do value iteration and when we do Q-learning. Value iteration assumes you know [mathjaxinline]T[/mathjaxinline] and [mathjaxinline]R[/mathjaxinline] and just need to <em>compute</em> [mathjaxinline]Q[/mathjaxinline]. In [mathjaxinline]Q[/mathjaxinline]-learning, we don't know or even directly estimate [mathjaxinline]T[/mathjaxinline] and [mathjaxinline]R[/mathjaxinline]: we estimate [mathjaxinline]Q[/mathjaxinline] directly from experience!) Recall the value-iteration update: </p><table id="a0000000005" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]Q(s,a) = R(s,a) + \gamma \sum _{s'} T(s,a,s')\max _{a'}Q(s',a')[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
We will adapt this update to the <i class="sc">rl</i> scenario, where we do not know the transition function [mathjaxinline]T[/mathjaxinline] or reward function [mathjaxinline]R[/mathjaxinline]. </p><p><img src="/assets/courseware/v1/c68197d7cf5f3f6e5992a838c081d102/asset-v1:MITx+6.036+1T2019+type@asset+block/images_reinforcement_learning_sequential_problems_codebox_1-crop.png" width="577"/></p><p>
Here, [mathjaxinline]\alpha[/mathjaxinline] represents the “learning rate," which needs to decay for convergence purposes, but in practice is often set to a constant. </p><p>
Note that the update can be rewritten as </p><table id="a0000000006" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]Q[s, a] \gets Q[s, a] - \alpha \left(Q[s,a] - (r + \gamma \max _{a'} Q[s',a'])\right)\, ,[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
which looks something like a gradient update! (It is actually not a gradient update, but later, when we consider function approximation, we will treat it as if it were.) This is often called a <em>temporal difference</em> learning method, because we make an update based on the difference between the current estimated value of taking action [mathjaxinline]a[/mathjaxinline] in state [mathjaxinline]s[/mathjaxinline], which is [mathjaxinline]Q[s, a][/mathjaxinline], and the “one-step" sampled value of taking [mathjaxinline]a[/mathjaxinline] in [mathjaxinline]s[/mathjaxinline], which is [mathjaxinline]r + \gamma \max _{a'} Q[s',a'][/mathjaxinline]. </p><p>
You can see this method as a combination of two different iterative processes that we have already seen: the combination of an old estimate with a new sample using a running average with a learning rate [mathjaxinline]\alpha[/mathjaxinline], and the dynamic-programming update of a [mathjaxinline]Q[/mathjaxinline] value from value iteration. </p><p>
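A minimal tabular sketch of that combination (all names, including env.reset, env.step, and select_action, are our own placeholders rather than the notes' pseudocode): </p>
<pre>
from collections import defaultdict

# Sketch: tabular Q-learning with the running-average (temporal difference) update.
# env.reset() is assumed to return an initial state s, and env.step(s, a) is assumed
# to return (r, s_next); select_action is whatever exploration strategy we choose.
def q_learning(env, actions, select_action, alpha=0.1, gamma=0.9, n_steps=10000):
    Q = defaultdict(float)                     # Q[(s, a)], initialized to 0
    s = env.reset()
    for _ in range(n_steps):
        a = select_action(Q, s, actions)
        r, s_next = env.step(s, a)
        target = r + gamma * max(Q[(s_next, ap)] for ap in actions)
        Q[(s, a)] = Q[(s, a)] + alpha * (target - Q[(s, a)])   # the update above
        s = s_next
    return Q
</pre>
<p>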
Our algorithm above includes a procedure called <i class="it">select_action</i>, which, given the current state [mathjaxinline]s[/mathjaxinline], has to decide which action to take. If the [mathjaxinline]Q[/mathjaxinline] value is estimated very accurately and the agent is behaving in the world, then generally we would want to choose the apparently optimal action [mathjaxinline]{\rm arg}\max _{a \in \mathcal A} Q(s,a)[/mathjaxinline]. But, during learning, the [mathjaxinline]Q[/mathjaxinline] value estimates won't be very good and exploration is important. However, exploring completely at random is also usually not the best strategy while learning, because it is good to focus your attention on the parts of the state space that are likely to be visited when executing a good policy (not a stupid one). </p><p>
A typical action-selection strategy is the [mathjaxinline]\epsilon[/mathjaxinline]-greedy strategy: </p><ul class="itemize"><li><p>
with probability [mathjaxinline]1-\epsilon[/mathjaxinline], choose [mathjaxinline]{\rm arg}\max _{a \in \mathcal A} Q(s,a)[/mathjaxinline] </p></li><li><p>
with probability [mathjaxinline]\epsilon[/mathjaxinline], choose an action [mathjaxinline]a \in \mathcal A[/mathjaxinline] uniformly at random </p></li></ul><p>
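A minimal sketch of this [mathjaxinline]\epsilon[/mathjaxinline]-greedy choice, written to plug into the Q-learning sketch above (names are our own): </p>
<pre>
import random

# Sketch: epsilon-greedy action selection over a tabular Q (a dict keyed by (s, a)).
def epsilon_greedy(Q, s, actions, epsilon=0.1):
    if random.random() < epsilon:
        return random.choice(list(actions))          # explore uniformly at random
    return max(actions, key=lambda a: Q[(s, a)])     # exploit the current estimates
</pre>
<p>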
Q-learning has the surprising property that it is <em>guaranteed</em> to converge to the actual optimal [mathjaxinline]Q[/mathjaxinline] function under fairly weak conditions! Any exploration strategy is okay as long as it tries every action infinitely often on an infinite run (so that it doesn't converge prematurely to a bad action choice). </p><p>
Q-learning can be very sample-inefficient: imagine a robot that has a choice between moving to the left and getting a reward of 1, then returning to its initial state, or moving to the right and walking down a 10-step hallway in order to get a reward of 1000, then returning to its initial state. </p><center><p><img src="/assets/courseware/v1/772289c9d0926be85afb89e8dc5a07af/asset-v1:MITx+6.036+1T2019+type@asset+block/images_reinforcement_learning_sequential_problems_tikzpicture_1-crop.png" width="848"/></p></center><p>
The first time the robot moves to the right and goes down the hallway, it will update the [mathjaxinline]Q[/mathjaxinline] value for the last state on the hallway to have a high value, but it won't yet understand that moving to the right was a good choice. The next time it moves down the hallway it updates the value of the state before the last one, and so on. After 10 trips down the hallway, it now can see that it is better to move to the right than to the left. </p><p>
More concretely, consider the vector of Q values [mathjaxinline]Q(0:10, \text { right})[/mathjaxinline], representing the Q values for moving right at each of the positions [mathjaxinline]0, \ldots , 9[/mathjaxinline]. Then, for [mathjaxinline]\alpha =1[/mathjaxinline] and [mathjaxinline]\gamma = 0.9[/mathjaxinline], </p><table id="a0000000007" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]Q(i, \text { right}) = R(i, \text { right}) + 0.9 \cdot \max _ a Q(i+1, a)[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
Starting with Q values of 0, </p><table id="a0000000008" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]Q^{(0)}(0:10, \text { right}) = \begin{bmatrix} 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0\end{bmatrix}[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>(Note: we are violating our usual notational conventions here, and writing [mathjaxinline]Q^{(i)}[/mathjaxinline] to mean the Q value function that results after the robot runs all the way to the end of the hallway, when executing the policy that always moves to the right.) Since the only nonzero reward from moving right is [mathjaxinline]R(9, \text { right}) = 1000[/mathjaxinline], after our robot makes it down the hallway once, our new Q vector is </p><table id="a0000000009" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]Q^{(1)}(0:10, \text { right}) = \begin{bmatrix} 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 1000 & 0\end{bmatrix}[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
After making its way down the hallway again, the robot updates [mathjaxinline]Q(8, \text { right}) = 0 + 0.9 \cdot Q(9, \text { right}) = 900[/mathjaxinline]: </p><table id="a0000000010" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]Q^{(2)}(0:10, \text { right}) = \begin{bmatrix} 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 900 & 1000 & 0 \end{bmatrix}[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
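This position-by-position propagation can be reproduced with a short script, sketched below under the assumption that the robot simply runs straight down the hallway on every trip (with [mathjaxinline]\alpha = 1[/mathjaxinline] and [mathjaxinline]\gamma = 0.9[/mathjaxinline] as above); it prints the vectors [mathjaxinline]Q^{(1)}[/mathjaxinline] through [mathjaxinline]Q^{(10)}[/mathjaxinline] in turn. </p>
<pre><code># Value propagation down the hallway; Q[i] holds Q(i, right), i = 0..10.
gamma = 0.9
R = [0.0] * 11
R[9] = 1000.0                  # the only nonzero reward for moving right
Q = [0.0] * 11                 # position 10 (end of hallway) stays at 0

for trip in range(1, 11):      # each trip updates positions 0..9 in the order visited
    for i in range(10):
        Q[i] = R[i] + gamma * Q[i + 1]
    print(trip, [round(q, 1) for q in Q])
</code></pre><p>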
Similarly, </p><table id="a0000000011" cellpadding="7" width="100%" cellspacing="0" class="eqnarray" style="table-layout:auto"><tr id="a0000000012"><td style="width:40%; border:none"> </td><td style="vertical-align:middle; text-align:right; border:none">
[mathjaxinline]\displaystyle Q^{(3)}(0:10, \text { right})[/mathjaxinline]
</td><td style="vertical-align:middle; text-align:left; border:none">
[mathjaxinline]\displaystyle = \begin{bmatrix} 0 & 0 & 0 & 0 & 0 & 0 & 0 & 810 & 900 & 1000 & 0 \end{bmatrix}[/mathjaxinline]
</td><td style="width:40%; border:none"> </td><td style="width:20%; border:none" class="eqnnum"> </td></tr><tr id="a0000000013"><td style="width:40%; border:none"> </td><td style="vertical-align:middle; text-align:right; border:none">
[mathjaxinline]\displaystyle Q^{(4)}(0:10, \text { right})[/mathjaxinline]
</td><td style="vertical-align:middle; text-align:left; border:none">
[mathjaxinline]\displaystyle = \begin{bmatrix} 0 & 0 & 0 & 0 & 0 & 0 & 729 & 810 & 900 & 1000 & 0 \end{bmatrix}[/mathjaxinline]
</td><td style="width:40%; border:none"> </td><td style="width:20%; border:none" class="eqnnum"> </td></tr><tr id="a0000000014"><td style="width:40%; border:none"> </td><td style="vertical-align:middle; text-align:right; border:none">
</td><td style="vertical-align:middle; text-align:left; border:none">
[mathjaxinline]\displaystyle \vdotswithin {=}[/mathjaxinline]
</td><td style="width:40%; border:none"> </td><td style="width:20%; border:none" class="eqnnum"> </td></tr><tr id="a0000000015"><td style="width:40%; border:none"> </td><td style="vertical-align:middle; text-align:right; border:none">
[mathjaxinline]\displaystyle Q^{(10)}(0:10, \text { right})[/mathjaxinline]
</td><td style="vertical-align:middle; text-align:left; border:none">
[mathjaxinline]\displaystyle = \begin{bmatrix} 387.4 & 430.5 & 478.3 & 531.4 & 590.5 & 656.1 & 729 & 810 & 900 & 1000 & 0 \end{bmatrix},[/mathjaxinline]
</td><td style="width:40%; border:none"> </td><td style="width:20%; border:none" class="eqnnum"> </td></tr></table><p>
and the robot finally sees the value of moving right from position 0. <span options="" class="marginote"><span class="marginote_desc" style="display:none">We can see how this interacts with the exploration/exploitation dilemma: from the perspective of [mathjaxinline]s_0[/mathjaxinline], it will seem, for a long time, that getting the immediate reward of [mathjaxinline]1[/mathjaxinline] is a better idea, and it would be easy to converge on that as a strategy without exploring the long hallway sufficiently.</span><span>note</span></span> <br/> <br/><span style="color:#FF0000"><b class="bf">Study Question:</b></span> <span style="color:#0000FF">Determine the Q value functions that will result from updates due to the robot always executing the “move left" policy.</span> <br/></p><p><h4>Function approximation</h4> In our Q-learning algorithm above, we essentially keep track of each [mathjaxinline]Q[/mathjaxinline] value in a table, indexed by [mathjaxinline]s[/mathjaxinline] and [mathjaxinline]a[/mathjaxinline]. What do we do if [mathjaxinline]\mathcal{S}[/mathjaxinline] and/or [mathjaxinline]\mathcal{A}[/mathjaxinline] are large (or continuous)? </p><p>
We can use a function approximator like a neural network to store Q values. For example, we could design a neural network that takes in inputs [mathjaxinline]s[/mathjaxinline] and [mathjaxinline]a[/mathjaxinline], and outputs [mathjaxinline]Q(s,a)[/mathjaxinline]. We can treat this as a regression problem, optimizing the squared Bellman error, with loss: </p><table id="a0000000016" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]\left(Q(s,a) - (r + \gamma \max _{a'}Q(s',a'))\right)^2\; \; ,[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
where [mathjaxinline]Q(s, a)[/mathjaxinline] is now the output of the neural network. </p><p>
There are actually several different architectural choices for using a neural network to approximate [mathjaxinline]Q[/mathjaxinline] values: </p><ul class="itemize"><li><p>
One network for each action [mathjaxinline]a_ j[/mathjaxinline], that takes [mathjaxinline]s[/mathjaxinline] as input and produces [mathjaxinline]Q(s, a_ j)[/mathjaxinline] as output; </p></li><li><p>
One single network that takes [mathjaxinline]s[/mathjaxinline] as input and produces a vector [mathjaxinline]Q(s, \cdot )[/mathjaxinline], consisting of the [mathjaxinline]Q[/mathjaxinline] values for each action; or </p></li><li><p>
One single network that takes [mathjaxinline]s, a[/mathjaxinline] concatenated into a vector (if [mathjaxinline]a[/mathjaxinline] is discrete, we would probably use a one-hot encoding, unless it had some useful internal structure) and produces [mathjaxinline]Q(s, a)[/mathjaxinline] as output. </p></li></ul><p><span options="" class="marginote"><span class="marginote_desc" style="display:none">For continuous action spaces, it is increasingly popular to use a class of methods called <em>actor-critic</em> methods, which combine policy and value-function learning. We won't get into them in detail here, though.</span><span>note</span></span></p><p>
The first two choices are only suitable for discrete (and not too big) action sets. The last choice can be applied for continuous actions, but then it is difficult to find [mathjaxinline]{\rm arg}\max _{a \in \mathcal A} Q(s, a)[/mathjaxinline]. </p><p>
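For discrete action sets, the second choice is a common one in practice. As a concrete but hypothetical sketch (using PyTorch, which the notes do not prescribe), it can be combined with the squared Bellman error loss from above; the network shape and the use of a fixed (detached) target are assumptions, not part of the notes. </p>
<pre><code>import torch
import torch.nn as nn

class QNetwork(nn.Module):
    """Takes a state vector; outputs a vector of Q values, one per discrete action."""
    def __init__(self, state_dim, num_actions, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, num_actions),
        )

    def forward(self, s):
        return self.net(s)                                        # shape: (batch, num_actions)

def bellman_loss(q_net, s, a, r, s_next, gamma=0.9):
    """Squared Bellman error for a batch of (s, a, r, s') transitions; a holds integer action indices."""
    q_sa = q_net(s).gather(1, a.unsqueeze(1)).squeeze(1)          # Q(s, a)
    with torch.no_grad():                                         # treat the target as a fixed regression target
        target = r + gamma * q_net(s_next).max(dim=1).values      # r + gamma * max_a' Q(s', a')
    return ((q_sa - target) ** 2).mean()
</code></pre><p>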
There are not many theoretical guarantees about Q-learning with function approximation and, indeed, it can sometimes be fairly unstable (learning to perform well for a while, and then getting suddenly worse, for example). But it has also had some significant successes. </p><p>
One form of instability that we do know how to guard against is <em>catastrophic forgetting.</em> In standard supervised learning, we expect that the training [mathjaxinline]x[/mathjaxinline] values were drawn independently <span options="" class="marginote"><span class="marginote_desc" style="display:none">And, in fact, we routinely shuffle their order in the data file, anyway.</span><span>from some distribution. </span></span> But when a learning agent, such as a robot, is moving through an environment, the sequence of states it encounters will <span options="" class="marginote"><span class="marginote_desc" style="display:none">For example, it might spend 12 hours in a dark environment and then 12 in a light one.</span><span>be temporally correlated. </span></span> This can mean that while it is in the dark, the neural-network weight-updates will make the [mathjaxinline]Q[/mathjaxinline] function “forget" the value function for when it's light. </p><p>
One way to handle this is to use <em>experience replay</em>, where we save our [mathjaxinline](s,a,r,s')[/mathjaxinline] experiences in a <i class="it">replay buffer</i>. Whenever we take a step in the world, we add the resulting [mathjaxinline](s,a,r,s')[/mathjaxinline] tuple to the replay buffer and use it to do a Q-learning update; we then also randomly select some number of tuples from the replay buffer and do Q-learning updates based on them as well. It often helps to keep only a <em>sliding window</em> of the most recent experiences (say, the 1000 most recent) in the replay buffer. (A larger buffer will be necessary in situations where the optimal policy might visit a large part of the state space, but we like to keep the buffer size small for memory reasons and also so that we don't focus on parts of the state space that are irrelevant for the optimal policy.) The idea is that these extra updates propagate reward values through the state space more efficiently. You can see this as doing something like value iteration, but using samples of experience rather than a known model. </p><p><h4>Fitted Q-learning</h4> An alternative strategy for learning the [mathjaxinline]Q[/mathjaxinline] function that is somewhat more robust than the standard [mathjaxinline]Q[/mathjaxinline]-learning algorithm is a method called <em>fitted Q</em>. </p><p><img src="/assets/courseware/v1/c06e75ee19fff52ca8aa3540b47c5c09/asset-v1:MITx+6.036+1T2019+type@asset+block/images_reinforcement_learning_sequential_problems_codebox_2-crop.png" width="836"/></p><p>
Here, we alternate between using the policy induced by the current [mathjaxinline]Q[/mathjaxinline] function to gather a batch of data [mathjaxinline]\mathcal D_\text {new}[/mathjaxinline], adding it to our overall data set [mathjaxinline]\mathcal D[/mathjaxinline], and then using supervised neural-network training to learn a representation of the [mathjaxinline]Q[/mathjaxinline] value function on the whole data set. This method does not mix the dynamic-programming phase (computing new [mathjaxinline]Q[/mathjaxinline] values based on old ones) with the function approximation phase (training the neural network) and avoids catastrophic forgetting. The regression training in line 9 typically uses squared error as a loss function and would be trained until the fit is good (possibly measured on held-out data). </p><p>
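A minimal sketch of this alternation is below; the <em>gather_batch</em> function (which runs the policy induced by the current [mathjaxinline]Q[/mathjaxinline], e.g. [mathjaxinline]\epsilon[/mathjaxinline]-greedily, and returns transitions as tensors), the round and epoch counts, and the optimizer are all assumptions for illustration, not details taken from the notes. </p>
<pre><code>import torch

def fitted_q(q_net, optimizer, gather_batch, rounds=10, epochs=50, gamma=0.9):
    """Fitted Q sketch: alternate gathering data with supervised regression on frozen targets."""
    dataset = []                                           # D: growing list of (s, a, r, s') tensor tuples
    for _ in range(rounds):
        dataset += gather_batch(q_net)                     # D_new gathered with the induced policy
        s, a, r, s_next = (torch.stack(col) for col in zip(*dataset))
        with torch.no_grad():                              # targets computed once from the old Q
            y = r + gamma * q_net(s_next).max(dim=1).values
        for _ in range(epochs):                            # supervised regression: fit Q(s, a) to y
            pred = q_net(s).gather(1, a.unsqueeze(1)).squeeze(1)
            loss = ((pred - y) ** 2).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    return q_net
</code></pre><p>
This keeps the dynamic-programming step (computing the targets [mathjaxinline]y[/mathjaxinline]) separate from the function-approximation step (the regression epochs), as described above.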
<br/></p><p>
<br/></p><p><a href="/assets/courseware/v1/153f87d9a11295896ffa8215253bf354/asset-v1:MITx+6.036+1T2019+type@asset+block/notes_chapter_Reinforcement_learning.pdf" target="_blank">Download this chapter as a PDF file</a></p><script src="/assets/courseware/v1/1ab2c06aefab58693cfc9c10394b7503/asset-v1:MITx+6.036+1T2019+type@asset+block/marginotes.js" type="text/javascript"/><span><br/><span style="color:gray;font-size:10pt"><center>This page was last updated on Friday May 24, 2019; 02:29:32 PM (revision 4f166135)</center></span></span>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L09j_vert" data-init="VerticalStudentView" data-block-type="vertical" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<h2 class="hd hd-2 unit-title">Lecture: Reinforcement learning demos</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09j">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-request-token="078e952803df11f0a0bf0affe2bbc7c1" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09j" data-init="XBlockToXModuleShim" data-block-type="video" data-runtime-class="LmsRuntime" data-course-id="course-v1:MITx+6.036+1T2019" data-has-score="False" data-graded="False" data-runtime-version="1">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: Reinforcement learning demos</h3>
<div
id="video_MIT6036L09j"
class="video closed"
data-metadata='{"autoAdvance": false, "prioritizeHls": false, "recordedYoutubeIsAvailable": true, "ytTestTimeout": 1500, "poster": null, "streams": "1.00:t6aKxyaE4s4", "saveStateEnabled": false, "end": 0.0, "speed": null, "completionPercentage": 0.95, "start": 0.0, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09j/handler/publish_completion", "duration": 0.0, "autoplay": false, "savedVideoPosition": 0.0, "generalSpeed": 1.0, "autohideHtml5": false, "ytMetadataEndpoint": "", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09j/handler/transcript/translation/__lang__", "showCaptions": "true", "completionEnabled": false, "captionDataDir": null, "ytApiUrl": "https://www.youtube.com/iframe_api", "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09j/handler/xmodule_handler/save_user_state", "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L09j/handler/transcript/available_translations", "sources": [], "transcriptLanguages": {"en": "English"}, "transcriptLanguage": "en", "lmsRootURL": "https://openlearninglibrary.mit.edu"}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L09j"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
© All Rights Reserved