<div class="xblock xblock-public_view xblock-public_view-vertical" data-block-type="vertical" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="VerticalStudentView" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@sequential_models_notes" data-graded="False" data-runtime-class="LmsRuntime">
<h2 class="hd hd-2 unit-title">Notes – Chapter 10: Sequential models</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@html+block@sequential_models_notes_top">
<div class="xblock xblock-public_view xblock-public_view-html xmodule_display xmodule_HtmlBlock" data-block-type="html" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="XBlockToXModuleShim" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@html+block@sequential_models_notes_top" data-graded="False" data-runtime-class="LmsRuntime">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "HTMLModule"}
</script>
<p>
You can sequence through the Sequential Models lecture video and note segments (go to Next page). </p><p>
You can also (or alternatively) download the <a href="/assets/courseware/v1/caee9a9ff60ed183e4485869c2e88ac4/asset-v1:MITx+6.036+1T2019+type@asset+block/notes_chapter_Sequential_models.pdf" target="_blank">Chapter 10: Sequential models</a> notes as a PDF file. </p>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-block-type="vertical" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="VerticalStudentView" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L08a_vert" data-graded="False" data-runtime-class="LmsRuntime">
<h2 class="hd hd-2 unit-title">Lecture: Sequential models - state machines</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08a">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-block-type="video" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="XBlockToXModuleShim" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08a" data-graded="False" data-runtime-class="LmsRuntime">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: Sequential models - state machines</h3>
<div
id="video_MIT6036L08a"
class="video closed"
data-metadata='{"saveStateEnabled": false, "autoplay": false, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08a/handler/publish_completion", "start": 0.0, "prioritizeHls": false, "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08a/handler/xmodule_handler/save_user_state", "recordedYoutubeIsAvailable": true, "streams": "1.00:tocmxrAmfAs", "autoAdvance": false, "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08a/handler/transcript/available_translations", "captionDataDir": null, "ytMetadataEndpoint": "", "showCaptions": "true", "lmsRootURL": "https://openlearninglibrary.mit.edu", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08a/handler/transcript/translation/__lang__", "ytApiUrl": "https://www.youtube.com/iframe_api", "transcriptLanguages": {"en": "English"}, "speed": null, "autohideHtml5": false, "generalSpeed": 1.0, "transcriptLanguage": "en", "savedVideoPosition": 0.0, "poster": null, "sources": [], "duration": 0.0, "end": 0.0, "completionEnabled": false, "completionPercentage": 0.95, "ytTestTimeout": 1500}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L08a"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-block-type="vertical" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="VerticalStudentView" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@state_machines_top_vert" data-graded="False" data-runtime-class="LmsRuntime">
<h2 class="hd hd-2 unit-title">Introduction to sequential models</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@html+block@state_machines_top">
<div class="xblock xblock-public_view xblock-public_view-html xmodule_display xmodule_HtmlBlock" data-block-type="html" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="XBlockToXModuleShim" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@html+block@state_machines_top" data-graded="False" data-runtime-class="LmsRuntime">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "HTMLModule"}
</script>
<p>
So far, we have limited our attention to domains in which each output [mathjaxinline]y[/mathjaxinline] is assumed to have been generated as a function of an associated input [mathjaxinline]x[/mathjaxinline], and our hypotheses have been "pure" functions, in which the output depends only on the input (and the parameters we have learned that govern the function's behavior). In the next few weeks, we are going to consider cases in which our models need to go beyond functions. </p><ul class="itemize"><li><p>
In <em>recurrent neural networks</em>, the hypothesis that we learn is not a function of a single input, but of the whole sequence of inputs that the predictor has received. </p></li><li><p>
In <em>reinforcement learning</em>, the hypothesis is either a <em>model</em> of a domain (such as a game) as a recurrent system, or a <em>policy</em>, which is a pure function but whose loss is determined by the ways in which the policy interacts with the domain over time. </p></li></ul><p>
Before we engage with those forms of learning, we will study models of sequential or recurrent systems that underlie the learning methods. </p>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-block-type="vertical" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="VerticalStudentView" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L08b_vert" data-graded="False" data-runtime-class="LmsRuntime">
<h2 class="hd hd-2 unit-title">Lecture: SM - State machine as a transducer</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08b">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-block-type="video" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="XBlockToXModuleShim" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08b" data-graded="False" data-runtime-class="LmsRuntime">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: SM - State machine as a transducer</h3>
<div
id="video_MIT6036L08b"
class="video closed"
data-metadata='{"saveStateEnabled": false, "autoplay": false, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08b/handler/publish_completion", "start": 0.0, "prioritizeHls": false, "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08b/handler/xmodule_handler/save_user_state", "recordedYoutubeIsAvailable": true, "streams": "1.00:vohmh6XIx1c", "autoAdvance": false, "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08b/handler/transcript/available_translations", "captionDataDir": null, "ytMetadataEndpoint": "", "showCaptions": "true", "lmsRootURL": "https://openlearninglibrary.mit.edu", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08b/handler/transcript/translation/__lang__", "ytApiUrl": "https://www.youtube.com/iframe_api", "transcriptLanguages": {"en": "English"}, "speed": null, "autohideHtml5": false, "generalSpeed": 1.0, "transcriptLanguage": "en", "savedVideoPosition": 0.0, "poster": null, "sources": [], "duration": 0.0, "end": 0.0, "completionEnabled": false, "completionPercentage": 0.95, "ytTestTimeout": 1500}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L08b"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-block-type="vertical" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="VerticalStudentView" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L08c_vert" data-graded="False" data-runtime-class="LmsRuntime">
<h2 class="hd hd-2 unit-title">Lecture: SM - Towards recurrent neural networks</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08c">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-block-type="video" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="XBlockToXModuleShim" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08c" data-graded="False" data-runtime-class="LmsRuntime">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: SM - Towards recurrent neural networks</h3>
<div
id="video_MIT6036L08c"
class="video closed"
data-metadata='{"saveStateEnabled": false, "autoplay": false, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08c/handler/publish_completion", "start": 0.0, "prioritizeHls": false, "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08c/handler/xmodule_handler/save_user_state", "recordedYoutubeIsAvailable": true, "streams": "1.00:iWBtDjw8wqw", "autoAdvance": false, "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08c/handler/transcript/available_translations", "captionDataDir": null, "ytMetadataEndpoint": "", "showCaptions": "true", "lmsRootURL": "https://openlearninglibrary.mit.edu", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08c/handler/transcript/translation/__lang__", "ytApiUrl": "https://www.youtube.com/iframe_api", "transcriptLanguages": {"en": "English"}, "speed": null, "autohideHtml5": false, "generalSpeed": 1.0, "transcriptLanguage": "en", "savedVideoPosition": 0.0, "poster": null, "sources": [], "duration": 0.0, "end": 0.0, "completionEnabled": false, "completionPercentage": 0.95, "ytTestTimeout": 1500}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L08c"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-block-type="vertical" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="VerticalStudentView" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@state_machines_state_machines_vert" data-graded="False" data-runtime-class="LmsRuntime">
<h2 class="hd hd-2 unit-title">State machines</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@html+block@state_machines_state_machines">
<div class="xblock xblock-public_view xblock-public_view-html xmodule_display xmodule_HtmlBlock" data-block-type="html" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="XBlockToXModuleShim" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@html+block@state_machines_state_machines" data-graded="False" data-runtime-class="LmsRuntime">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "HTMLModule"}
</script>
<p>
A <span options="" class="marginote"><span class="marginote_desc" style="display:none">This is such a pervasive idea that it has been given many names in many subareas of computer science, control theory, physics, etc., including: <em>automaton</em>, <i class="it">transducer</i>, <i class="it">dynamical system</i>, <i class="it">system</i>, etc.</span><span><em>state machine</em></span></span> is a description of a process (computational, physical, economic) in terms of its potential sequences of <em>states</em>. </p><p>
The <em>state</em> of a system is defined to be all you would need to know about the system to predict its future trajectories as well as possible. It could be the position and velocity of an object or the locations of your pieces on a game board, or the current traffic densities on a highway network. </p><p>
Formally, we define a <em>state machine</em> <span options="" class="marginote"><span class="marginote_desc" style="display:none">There are a huge number of major and minor variations on the idea of a state machine. We'll just work with one specific one in this section and another one in the next, but don't worry if you see other variations out in the world!</span><span>as </span></span> [mathjaxinline](\mathcal{S}, \mathcal{X}, \mathcal{Y}, s_0, f, g)[/mathjaxinline] where </p><ul class="itemize"><li><p>
[mathjaxinline]\mathcal{S}[/mathjaxinline] is a finite or infinite set of possible states; </p></li><li><p>
[mathjaxinline]\mathcal{X}[/mathjaxinline] is a finite or infinite set of possible inputs; </p></li><li><p>
[mathjaxinline]\mathcal{Y}[/mathjaxinline] is a finite or infinite set of possible outputs; </p></li><li><p>
[mathjaxinline]s_0 \in \mathcal{S}[/mathjaxinline] is the initial state of the machine; </p></li><li><p>
[mathjaxinline]f: \mathcal{S} \times \mathcal{X} \rightarrow \mathcal{S}[/mathjaxinline] is a <em>transition function</em>, which takes an input and a previous state and produces a next state; </p></li><li><p>
[mathjaxinline]g: \mathcal{S} \rightarrow \mathcal{Y}[/mathjaxinline] is an <em>output function</em>, which takes a state and produces an output. </p></li></ul><p>
The basic operation of the state machine is to start with state [mathjaxinline]s_0[/mathjaxinline] (in some cases, we will pick a starting state from a set or distribution), then iteratively compute: </p><p>
[mathjax]\begin{aligned} s_t &= f(s_{t - 1}, x_t) \\ y_t &= g(s_t) \end{aligned}[/mathjax]
</p><p><div style="border-radius:10px;padding:5px;border-style:solid;background-color:rgba(0,255,0,0.03);" class="examplebox"><p>
The diagram below illustrates this process. Note that the "feedback" connection of [mathjaxinline]s_t[/mathjaxinline] back into [mathjaxinline]f[/mathjaxinline] has to be buffered, or delayed, by one time step; otherwise what it is computing would not generally be well defined. </p><center><p><img src="/assets/courseware/v1/ec3f3fb77920eb6418b33bc7a49d647e/asset-v1:MITx+6.036+1T2019+type@asset+block/images_state_machines_state_machines_tikzpicture_1-crop.png" width="520"/></p></center></div> So, given a sequence of inputs [mathjaxinline]x_1, x_2, \dots[/mathjaxinline], the machine generates a sequence of outputs </p><p>
[mathjax]\underbrace{g(f(x_1, s_0))}_{y_1}, \underbrace{g(f(x_2, f(x_1, s_0)))}_{y_2}, \dots \; \; .[/mathjax]
</p><p>
We sometimes say that the machine <em>transduces</em> sequence [mathjaxinline]x[/mathjaxinline] into sequence [mathjaxinline]y[/mathjaxinline]. The output at time [mathjaxinline]t[/mathjaxinline] can depend on inputs from steps [mathjaxinline]1[/mathjaxinline] through [mathjaxinline]t[/mathjaxinline]. </p><p>
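To make this operation concrete, here is a minimal sketch of a state-machine transducer in Python (our own illustration, not from the notes; the name <code>transduce</code> is ours): </p>
<pre>
# Run a state machine (S, X, Y, s0, f, g) as a transducer: start at s0,
# then repeatedly apply the transition function f and output function g.
def transduce(f, g, s0, xs):
    s = s0
    ys = []
    for x in xs:
        s = f(s, x)      # s_t = f(s_{t-1}, x_t)
        ys.append(g(s))  # y_t = g(s_t)
    return ys
</pre>
<p>
Note that the state variable carries information forward between steps; this is exactly the buffered feedback connection in the diagram above. </p><p>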
One common form is <em>finite state machines</em>, in which [mathjaxinline]\mathcal S[/mathjaxinline], [mathjaxinline]\mathcal X[/mathjaxinline], and [mathjaxinline]\mathcal Y[/mathjaxinline] are all finite sets. They are often described using <em>state transition diagrams</em> such as the one below, in which nodes stand for states and arcs indicate transitions. Nodes are labeled by which output they generate, and arcs are labeled by which input causes the transition. (All computers can be described, at the digital level, as finite state machines. Big, but finite!) </p><p><div style="border-radius:10px;padding:5px;border-style:solid;background-color:rgba(0,255,0,0.03);" class="examplebox"><p>
One can verify that the state machine below reads binary strings and determines the parity of the number of zeros in the given string. Check for yourself that all input binary strings end in state [mathjaxinline]S_1[/mathjaxinline] if and only if they contain an even number of zeros. </p><center><img src="/assets/courseware/v1/5bae6aecfb4092445bd1e135d9a41005/asset-v1:MITx+6.036+1T2019+type@asset+block/images_FSM.png" width="400"/></center></div></p><p>
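Assuming (since the figure's labels are not spelled out in the text) that state [mathjaxinline]S_1[/mathjaxinline] means "an even number of zeros seen so far," [mathjaxinline]S_2[/mathjaxinline] means "an odd number," and each state outputs its own label, the machine can be run with the <code>transduce</code> sketch above: </p>
<pre>
# Parity-of-zeros machine: reading a '0' flips the state,
# reading a '1' leaves it unchanged (state names are assumed).
def parity_f(s, x):
    if x == '0':
        return 'S2' if s == 'S1' else 'S1'
    return s

parity_g = lambda s: s  # the output is just the current state

print(transduce(parity_f, parity_g, 'S1', "1001"))
# ['S1', 'S2', 'S1', 'S1']: ends in S1, since "1001" has two zeros
</pre>
<p>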
Another common structure that is simple but powerful, and widely used in signal processing and control, is the <em>linear time-invariant (LTI) system</em>. In this case, [mathjaxinline]\mathcal S = \mathbb {R}^m[/mathjaxinline], [mathjaxinline]\mathcal X = \mathbb {R}^\ell[/mathjaxinline], and [mathjaxinline]\mathcal Y = \mathbb {R}^n[/mathjaxinline], and [mathjaxinline]f[/mathjaxinline] and [mathjaxinline]g[/mathjaxinline] are linear functions of their inputs. In discrete time, such a system can be defined by a linear difference equation, like </p><p>
[mathjax]y[t] = 3y[t - 1] + 6y[t - 2] + 5x[t] + 3x[t - 2] \; \; ,[/mathjax]
</p><p>
where [mathjaxinline]y[t][/mathjaxinline] is the value of [mathjaxinline]y[/mathjaxinline] at time [mathjaxinline]t[/mathjaxinline]; it can be implemented using state to store the relevant previous input and output information. </p><p>
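As a sketch of how that works (reusing the <code>transduce</code> function from above; this particular state layout is one natural choice, not prescribed by the notes), the state can store the two previous outputs and the two previous inputs: </p>
<pre>
# State = (y[t-1], y[t-2], x[t-1], x[t-2]) for the difference equation
# y[t] = 3y[t-1] + 6y[t-2] + 5x[t] + 3x[t-2].
def lti_f(state, x):
    y1, y2, x1, x2 = state
    y = 3*y1 + 6*y2 + 5*x + 3*x2
    return (y, y1, x, x1)   # shift everything forward one time step

def lti_g(state):
    return state[0]         # the newest output, y[t]

print(transduce(lti_f, lti_g, (0, 0, 0, 0), [1, 0, 0, 0]))
# impulse response: [5, 15, 78, 324]
</pre>
<p>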
We will study <em>recurrent neural networks</em>, which are a lot like a non-linear version of an LTI system, with transition and output functions </p><p>
[mathjax]\begin{aligned} f(s, x) &= f_1(W^{sx}x + W^{ss}s + W^{ss}_0) \\ g(s) &= f_2(W^0 s + W^0_0) \end{aligned}[/mathjax]
</p><p>
defined by weight matrices </p><p>
[mathjax]\begin{aligned} W^{sx} &: m \times \ell \\ W^{ss} &: m \times m \\ W^{ss}_0 &: m \times 1 \\ W^{0} &: n \times m \\ W^{0}_0 &: n \times 1 \end{aligned}[/mathjax]
</p><p>
and activation functions [mathjaxinline]f_1[/mathjaxinline] and [mathjaxinline]f_2[/mathjaxinline]. We will see that it's actually possible to learn weight values for a recurrent neural network using gradient descent. </p><p>
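As an illustration only (the choices of [mathjaxinline]f_1 = \tanh[/mathjaxinline], [mathjaxinline]f_2 =[/mathjaxinline] identity, and random weights below are ours, not the notes'), one step of such a network is a few lines of numpy, and the whole machine again runs through <code>transduce</code>: </p>
<pre>
import numpy as np

# Dimensions: state m, input l, output n (arbitrary small values).
m, l, n = 4, 3, 2
rng = np.random.default_rng(0)
W_sx  = rng.normal(size=(m, l))   # W^{sx}
W_ss  = rng.normal(size=(m, m))   # W^{ss}
W_ss0 = rng.normal(size=(m, 1))   # W^{ss}_0
W_0   = rng.normal(size=(n, m))   # W^0
W_00  = rng.normal(size=(n, 1))   # W^0_0

def rnn_f(s, x):                  # transition function, f1 = tanh
    return np.tanh(W_sx @ x + W_ss @ s + W_ss0)

def rnn_g(s):                     # output function, f2 = identity
    return W_0 @ s + W_00

xs = [rng.normal(size=(l, 1)) for _ in range(5)]
ys = transduce(rnn_f, rnn_g, np.zeros((m, 1)), xs)  # five n-by-1 outputs
</pre>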
<div class="xblock xblock-public_view xblock-public_view-vertical" data-block-type="vertical" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="VerticalStudentView" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L08d_vert" data-graded="False" data-runtime-class="LmsRuntime">
<h2 class="hd hd-2 unit-title">Lecture: SM - Markov decision processes - states and actions</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08d">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-block-type="video" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="XBlockToXModuleShim" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08d" data-graded="False" data-runtime-class="LmsRuntime">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: SM - Markov decision processes - states and actions</h3>
<div
id="video_MIT6036L08d"
class="video closed"
data-metadata='{"saveStateEnabled": false, "autoplay": false, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08d/handler/publish_completion", "start": 0.0, "prioritizeHls": false, "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08d/handler/xmodule_handler/save_user_state", "recordedYoutubeIsAvailable": true, "streams": "1.00:rxk1BoFRghk", "autoAdvance": false, "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08d/handler/transcript/available_translations", "captionDataDir": null, "ytMetadataEndpoint": "", "showCaptions": "true", "lmsRootURL": "https://openlearninglibrary.mit.edu", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08d/handler/transcript/translation/__lang__", "ytApiUrl": "https://www.youtube.com/iframe_api", "transcriptLanguages": {"en": "English"}, "speed": null, "autohideHtml5": false, "generalSpeed": 1.0, "transcriptLanguage": "en", "savedVideoPosition": 0.0, "poster": null, "sources": [], "duration": 0.0, "end": 0.0, "completionEnabled": false, "completionPercentage": 0.95, "ytTestTimeout": 1500}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L08d"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-block-type="vertical" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="VerticalStudentView" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L08e_vert" data-graded="False" data-runtime-class="LmsRuntime">
<h2 class="hd hd-2 unit-title">Lecture: SM - MDP - the transition function</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08e">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-block-type="video" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="XBlockToXModuleShim" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08e" data-graded="False" data-runtime-class="LmsRuntime">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: SM - MDP - the transition function</h3>
<div
id="video_MIT6036L08e"
class="video closed"
data-metadata='{"saveStateEnabled": false, "autoplay": false, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08e/handler/publish_completion", "start": 0.0, "prioritizeHls": false, "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08e/handler/xmodule_handler/save_user_state", "recordedYoutubeIsAvailable": true, "streams": "1.00:n0MvChqTumQ", "autoAdvance": false, "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08e/handler/transcript/available_translations", "captionDataDir": null, "ytMetadataEndpoint": "", "showCaptions": "true", "lmsRootURL": "https://openlearninglibrary.mit.edu", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08e/handler/transcript/translation/__lang__", "ytApiUrl": "https://www.youtube.com/iframe_api", "transcriptLanguages": {"en": "English"}, "speed": null, "autohideHtml5": false, "generalSpeed": 1.0, "transcriptLanguage": "en", "savedVideoPosition": 0.0, "poster": null, "sources": [], "duration": 0.0, "end": 0.0, "completionEnabled": false, "completionPercentage": 0.95, "ytTestTimeout": 1500}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L08e"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-block-type="vertical" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="VerticalStudentView" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L08f_vert" data-graded="False" data-runtime-class="LmsRuntime">
<h2 class="hd hd-2 unit-title">Lecture: SM - MDP - the reward and policy functions</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08f">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-block-type="video" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="XBlockToXModuleShim" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08f" data-graded="False" data-runtime-class="LmsRuntime">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: SM - MDP - the reward and policy functions</h3>
<div
id="video_MIT6036L08f"
class="video closed"
data-metadata='{"saveStateEnabled": false, "autoplay": false, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08f/handler/publish_completion", "start": 0.0, "prioritizeHls": false, "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08f/handler/xmodule_handler/save_user_state", "recordedYoutubeIsAvailable": true, "streams": "1.00:kyvWUCDOrfc", "autoAdvance": false, "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08f/handler/transcript/available_translations", "captionDataDir": null, "ytMetadataEndpoint": "", "showCaptions": "true", "lmsRootURL": "https://openlearninglibrary.mit.edu", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08f/handler/transcript/translation/__lang__", "ytApiUrl": "https://www.youtube.com/iframe_api", "transcriptLanguages": {"en": "English"}, "speed": null, "autohideHtml5": false, "generalSpeed": 1.0, "transcriptLanguage": "en", "savedVideoPosition": 0.0, "poster": null, "sources": [], "duration": 0.0, "end": 0.0, "completionEnabled": false, "completionPercentage": 0.95, "ytTestTimeout": 1500}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L08f"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-block-type="vertical" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="VerticalStudentView" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L08g_vert" data-graded="False" data-runtime-class="LmsRuntime">
<h2 class="hd hd-2 unit-title">Lecture: SM - MDP - finite horizon and the value function</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08g">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-block-type="video" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="XBlockToXModuleShim" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08g" data-graded="False" data-runtime-class="LmsRuntime">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: SM - MDP - finite horizon and the value function</h3>
<div
id="video_MIT6036L08g"
class="video closed"
data-metadata='{"saveStateEnabled": false, "autoplay": false, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08g/handler/publish_completion", "start": 0.0, "prioritizeHls": false, "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08g/handler/xmodule_handler/save_user_state", "recordedYoutubeIsAvailable": true, "streams": "1.00:PWt-TF0puSY", "autoAdvance": false, "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08g/handler/transcript/available_translations", "captionDataDir": null, "ytMetadataEndpoint": "", "showCaptions": "true", "lmsRootURL": "https://openlearninglibrary.mit.edu", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08g/handler/transcript/translation/__lang__", "ytApiUrl": "https://www.youtube.com/iframe_api", "transcriptLanguages": {"en": "English"}, "speed": null, "autohideHtml5": false, "generalSpeed": 1.0, "transcriptLanguage": "en", "savedVideoPosition": 0.0, "poster": null, "sources": [], "duration": 0.0, "end": 0.0, "completionEnabled": false, "completionPercentage": 0.95, "ytTestTimeout": 1500}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L08g"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-block-type="vertical" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="VerticalStudentView" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L08h_vert" data-graded="False" data-runtime-class="LmsRuntime">
<h2 class="hd hd-2 unit-title">Lecture: SM - MDP - computing the value function</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08h">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-block-type="video" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="XBlockToXModuleShim" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08h" data-graded="False" data-runtime-class="LmsRuntime">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: SM - MDP - computing the value function</h3>
<div
id="video_MIT6036L08h"
class="video closed"
data-metadata='{"saveStateEnabled": false, "autoplay": false, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08h/handler/publish_completion", "start": 0.0, "prioritizeHls": false, "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08h/handler/xmodule_handler/save_user_state", "recordedYoutubeIsAvailable": true, "streams": "1.00:MOESKBfuxSc", "autoAdvance": false, "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08h/handler/transcript/available_translations", "captionDataDir": null, "ytMetadataEndpoint": "", "showCaptions": "true", "lmsRootURL": "https://openlearninglibrary.mit.edu", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08h/handler/transcript/translation/__lang__", "ytApiUrl": "https://www.youtube.com/iframe_api", "transcriptLanguages": {"en": "English"}, "speed": null, "autohideHtml5": false, "generalSpeed": 1.0, "transcriptLanguage": "en", "savedVideoPosition": 0.0, "poster": null, "sources": [], "duration": 0.0, "end": 0.0, "completionEnabled": false, "completionPercentage": 0.95, "ytTestTimeout": 1500}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L08h"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-block-type="vertical" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="VerticalStudentView" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L08j_vert" data-graded="False" data-runtime-class="LmsRuntime">
<h2 class="hd hd-2 unit-title">Lecture: SM - MDP - finding an optimal policy</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08j">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-block-type="video" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="XBlockToXModuleShim" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08j" data-graded="False" data-runtime-class="LmsRuntime">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: SM - MDP - finding an optimal policy</h3>
<div
id="video_MIT6036L08j"
class="video closed"
data-metadata='{"saveStateEnabled": false, "autoplay": false, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08j/handler/publish_completion", "start": 0.0, "prioritizeHls": false, "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08j/handler/xmodule_handler/save_user_state", "recordedYoutubeIsAvailable": true, "streams": "1.00:R6mmXwNYmeg", "autoAdvance": false, "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08j/handler/transcript/available_translations", "captionDataDir": null, "ytMetadataEndpoint": "", "showCaptions": "true", "lmsRootURL": "https://openlearninglibrary.mit.edu", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08j/handler/transcript/translation/__lang__", "ytApiUrl": "https://www.youtube.com/iframe_api", "transcriptLanguages": {"en": "English"}, "speed": null, "autohideHtml5": false, "generalSpeed": 1.0, "transcriptLanguage": "en", "savedVideoPosition": 0.0, "poster": null, "sources": [], "duration": 0.0, "end": 0.0, "completionEnabled": false, "completionPercentage": 0.95, "ytTestTimeout": 1500}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L08j"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-block-type="vertical" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="VerticalStudentView" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L08k_vert" data-graded="False" data-runtime-class="LmsRuntime">
<h2 class="hd hd-2 unit-title">Lecture: SM - MDP - infinite-horizons and the value iteration algorithm</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08k">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-block-type="video" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="XBlockToXModuleShim" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08k" data-graded="False" data-runtime-class="LmsRuntime">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: SM - MDP - infinite-horizons and the value iteration algorithm</h3>
<div
id="video_MIT6036L08k"
class="video closed"
data-metadata='{"saveStateEnabled": false, "autoplay": false, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08k/handler/publish_completion", "start": 0.0, "prioritizeHls": false, "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08k/handler/xmodule_handler/save_user_state", "recordedYoutubeIsAvailable": true, "streams": "1.00:Nuo_8ZacVXo", "autoAdvance": false, "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08k/handler/transcript/available_translations", "captionDataDir": null, "ytMetadataEndpoint": "", "showCaptions": "true", "lmsRootURL": "https://openlearninglibrary.mit.edu", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08k/handler/transcript/translation/__lang__", "ytApiUrl": "https://www.youtube.com/iframe_api", "transcriptLanguages": {"en": "English"}, "speed": null, "autohideHtml5": false, "generalSpeed": 1.0, "transcriptLanguage": "en", "savedVideoPosition": 0.0, "poster": null, "sources": [], "duration": 0.0, "end": 0.0, "completionEnabled": false, "completionPercentage": 0.95, "ytTestTimeout": 1500}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L08k"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-block-type="vertical" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="VerticalStudentView" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@state_machines_markov_decision_processes_vert" data-graded="False" data-runtime-class="LmsRuntime">
<h2 class="hd hd-2 unit-title">Markov decision processes</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@html+block@state_machines_markov_decision_processes">
<div class="xblock xblock-public_view xblock-public_view-html xmodule_display xmodule_HtmlBlock" data-block-type="html" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="XBlockToXModuleShim" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@html+block@state_machines_markov_decision_processes" data-graded="False" data-runtime-class="LmsRuntime">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "HTMLModule"}
</script>
<p>
A <em>Markov decision process</em> (<i class="sc">mdp</i>) is a variation on a state machine in which: </p><ul class="itemize"><li><p>
The transition function is <em>stochastic</em>, meaning that it defines a probability distribution over the next state given the previous state and input; each time it is evaluated, it draws a new state from that distribution. (Recall that stochastic is another word for <em>probabilistic</em>; we don't say "random" because that can be interpreted in two ways, both of which are incorrect: we don't pick the transition function itself at random from a distribution, and the transition function doesn't pick its output <em>uniformly</em> at random.) </p></li><li><p>
The output is equal to the state (that is, [mathjaxinline]g[/mathjaxinline] is the identity function). (There is an interesting variation on <i class="sc">mdp</i>s, called a <em>partially observable</em> <i class="sc">mdp</i>, in which the output is also drawn from a distribution depending on the state.) </p></li><li><p>
Some states (or state-action pairs) are more desirable than others. </p></li></ul><p>
An <i class="sc">mdp</i> can be used to model interaction with an outside "world," such as a single-player game. (There is also an interesting, direct extension to two-player zero-sum games, such as chess and Go.) We will focus on the case in which [mathjaxinline]\mathcal S[/mathjaxinline] and [mathjaxinline]\mathcal X[/mathjaxinline] are finite, and will call the input set [mathjaxinline]\mathcal A[/mathjaxinline], for <em>actions</em> (rather than [mathjaxinline]\mathcal X[/mathjaxinline]). The idea is that an agent (a robot or a game-player) can model its environment as an <i class="sc">mdp</i> and try to choose actions that will drive the process into states that have high scores. </p><p>
Formally, an MDP is [mathjaxinline]\langle \mathcal S, \mathcal A, T, R, \gamma \rangle[/mathjaxinline] where: </p><ul class="itemize"><li><p>
[mathjaxinline]T : \mathcal S \times \mathcal A \times \mathcal S \rightarrow \mathbb {R}[/mathjaxinline] is a <em>transition model</em>, where </p><table id="a0000000016" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]T(s, a, s') = P(S_ t = s'|S_{t - 1} = s, A_{t - 1} = a)\; \; ,[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
specifying a conditional probability distribution; <span options="" class="marginote"><span class="marginote_desc" style="display:none">The notation here uses capital letters, like [mathjaxinline]S[/mathjaxinline], to stand for random variables and small letters to stand for concrete values. So [mathjaxinline]S_ t[/mathjaxinline] here is a random variable that can take on elements of [mathjaxinline]\mathcal S[/mathjaxinline] as values.</span><span>note</span></span> </p></li><li><p>
[mathjaxinline]R: \mathcal S \times \mathcal A \rightarrow \mathbb {R}[/mathjaxinline] is a reward function, where [mathjaxinline]R(s, a)[/mathjaxinline] specifies how desirable it is to be in state [mathjaxinline]s[/mathjaxinline] and take action [mathjaxinline]a[/mathjaxinline]; and </p></li><li><p>
[mathjaxinline]\gamma \in [0, 1][/mathjaxinline] is a <em>discount factor</em>, which we'll discuss in the section on infinite-horizon solutions below. </p></li></ul><p>
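To make the definition concrete, here is one minimal way to represent a finite <i class="sc">mdp</i> in code. This is a sketch under assumed conventions (numpy arrays, with states and actions indexed by integers, and made-up example numbers), not code provided by the course:
</p>
<pre><code>
import numpy as np

# A tiny assumed example: n = 2 states, m = 2 actions.
# Transition model: T[s, a, s2] = P(S_t = s2 | S_{t-1} = s, A_{t-1} = a),
# so each slice T[s, a, :] is a probability distribution over next states.
T = np.array([[[0.9, 0.1],    # from state 0 under action 0
               [0.2, 0.8]],   # from state 0 under action 1
              [[0.0, 1.0],    # from state 1 under action 0
               [0.5, 0.5]]])  # from state 1 under action 1

# Reward function: R[s, a] says how desirable taking action a in state s is.
R = np.array([[0.0, 1.0],
              [2.0, 0.0]])

gamma = 0.9   # discount factor

# Each T[s, a, :] must sum to 1, since it is a conditional distribution.
assert np.allclose(T.sum(axis=2), 1.0)
</code></pre>
<p>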
A <i class="it">policy</i> is a function [mathjaxinline]\pi : \mathcal S \rightarrow \mathcal A[/mathjaxinline] that specifies what action to take in each state. </p><p><h3>Finite-horizon solutions</h3></p><p>
Given an <i class="sc">mdp</i>, our goal is typically to find a policy that is optimal in the sense that it gets as much total reward as possible, in expectation over the stochastic transitions that the domain makes. In this section, we will consider the case where there is a finite <em>horizon</em> [mathjaxinline]H[/mathjaxinline], indicating the total number of steps of interaction that the agent will have with the <i class="sc">mdp</i>. </p><p><h4>Evaluating a given policy</h4></p><p>
Before we can talk about how to find a good policy, we have to specify a measure of the goodness of a policy. We will do so by defining, for a given <i class="sc">mdp</i>, policy [mathjaxinline]\pi[/mathjaxinline], and horizon [mathjaxinline]h[/mathjaxinline], the “horizon [mathjaxinline]h[/mathjaxinline] <em>value</em>" of a state, [mathjaxinline]V^{h}_\pi (s)[/mathjaxinline]. We do this by induction on the horizon, which is the <em>number of steps left to go</em>. </p><p>
The base case is when there are no steps remaining, in which case, no matter what state we're in, the value is 0, so </p><table id="a0000000017" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]V^0_{\pi }(s) = 0\; \; .[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
Then, the value of a policy in state [mathjaxinline]s[/mathjaxinline] at horizon [mathjaxinline]h + 1[/mathjaxinline] is equal to the reward it will get in state [mathjaxinline]s[/mathjaxinline] plus the next state's expected horizon [mathjaxinline]h[/mathjaxinline] value. So, starting with horizons 1 and 2, and then moving to the general case, we have: </p><table id="a0000000018" cellpadding="7" width="100%" cellspacing="0" class="eqnarray" style="table-layout:auto"><tr id="a0000000019"><td style="width:40%; border:none"> </td><td style="vertical-align:middle; text-align:right; border:none">
[mathjaxinline]\displaystyle V^1_{\pi }(s)[/mathjaxinline]
</td><td style="vertical-align:middle; text-align:left; border:none">
[mathjaxinline]\displaystyle = R(s, \pi (s)) + 0[/mathjaxinline]
</td><td style="width:40%; border:none"> </td><td style="width:20%; border:none" class="eqnnum"> </td></tr><tr id="a0000000020"><td style="width:40%; border:none"> </td><td style="vertical-align:middle; text-align:right; border:none">
[mathjaxinline]\displaystyle V^2_{\pi }(s)[/mathjaxinline]
</td><td style="vertical-align:middle; text-align:left; border:none">
[mathjaxinline]\displaystyle = R(s, \pi (s)) + \sum _{s'}T(s, \pi (s), s') \cdot R(s', \pi (s'))[/mathjaxinline]
</td><td style="width:40%; border:none"> </td><td style="width:20%; border:none" class="eqnnum"> </td></tr><tr id="a0000000021"><td style="width:40%; border:none"> </td><td style="vertical-align:middle; text-align:right; border:none">
[mathjaxinline]\displaystyle \vdots[/mathjaxinline]
</td><td style="width:40%; border:none"> </td><td style="width:20%; border:none" class="eqnnum"> </td></tr><tr id="a0000000022"><td style="width:40%; border:none"> </td><td style="vertical-align:middle; text-align:right; border:none">
[mathjaxinline]\displaystyle V^ h_{\pi }(s)[/mathjaxinline]
</td><td style="vertical-align:middle; text-align:left; border:none">
[mathjaxinline]\displaystyle = R(s, \pi (s)) + \sum _{s'}T(s, \pi (s), s') \cdot V^{h - 1}_{\pi }(s')[/mathjaxinline]
</td><td style="width:40%; border:none"> </td><td style="width:20%; border:none" class="eqnnum"> </td></tr></table><p>
The sum over [mathjaxinline]s'[/mathjaxinline] is an <em>expected value</em>: it considers all possible next states [mathjaxinline]s'[/mathjaxinline], and computes an average of their [mathjaxinline](h-1)[/mathjaxinline]-horizon values, weighted by the probability that the transition function from state [mathjaxinline]s[/mathjaxinline] with the action chosen by the policy, [mathjaxinline]\pi (s)[/mathjaxinline], assigns to arriving in state [mathjaxinline]s'[/mathjaxinline]. <br/> <br/><span style="color:#FF0000"><b class="bf">Study Question:</b></span> <span style="color:#0000FF">What is [mathjaxinline]\sum _{s'} T(s, a, s')[/mathjaxinline] for any particular [mathjaxinline]s[/mathjaxinline] and [mathjaxinline]a[/mathjaxinline]? </span> <br/></p><p>
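This recursion translates directly into code. Here is a minimal sketch (assuming the numpy representation sketched earlier, with the policy given as an integer array <code>pi</code> of length [mathjaxinline]n[/mathjaxinline]):
</p>
<pre><code>
import numpy as np

def policy_value(T, R, pi, h):
    """Horizon-h value of policy pi at every state: V^h_pi(s) for all s."""
    n = T.shape[0]
    V = np.zeros(n)                  # base case: V^0_pi(s) = 0
    for _ in range(h):
        V_new = np.empty(n)
        for s in range(n):
            a = pi[s]                # the action the policy takes in state s
            # reward now, plus the expected horizon-(h-1) value of the next state
            V_new[s] = R[s, a] + T[s, a, :] @ V
        V = V_new
    return V

# e.g., evaluate "always take action 0" on the tiny example for three steps:
# policy_value(T, R, np.array([0, 0]), h=3)
</code></pre>
<p>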
Then we can say that a policy [mathjaxinline]\pi _1[/mathjaxinline] is better than policy [mathjaxinline]\pi _2[/mathjaxinline] for horizon [mathjaxinline]h[/mathjaxinline], i.e. [mathjaxinline]\pi _1 >_ h \pi _2[/mathjaxinline], if and only if for all [mathjaxinline]s \in \mathcal S[/mathjaxinline], [mathjaxinline]V_{\pi _1}^ h(s) \geq V_{\pi _2}^ h(s)[/mathjaxinline] and there exists at least one [mathjaxinline]s \in \mathcal S[/mathjaxinline] such that [mathjaxinline]V_{\pi _1}^ h(s) > V_{\pi _2}^ h(s)[/mathjaxinline]. </p><p><h4>Finding an optimal policy</h4></p><p>
How can we go about finding an optimal policy for an <i class="sc">mdp</i>? We could imagine enumerating all possible policies and calculating their value functions as in the previous section and picking the best one...but that's too much work! </p><p>
The first observation to make is that, in a finite-horizon problem, the best action to take depends on the current state, but also on the horizon: imagine that you are in a situation where you could reach a state with reward 5 in one step or a state with reward 10 in two steps. If you have at least two steps to go, then you'd move toward the reward 10 state, but if you only have one step left to go, you should go in the direction that will allow you to gain 5! </p><p>
One way to find an optimal policy is to compute an <em>optimal action-value function</em>, [mathjaxinline]Q[/mathjaxinline]. We define [mathjaxinline]Q^ h(s, a)[/mathjaxinline] to be the expected value of </p><ul class="itemize"><li><p>
starting in state [mathjaxinline]s[/mathjaxinline], </p></li><li><p>
executing action [mathjaxinline]a[/mathjaxinline], and </p></li><li><p>
continuing for [mathjaxinline]h - 1[/mathjaxinline] more steps executing an optimal policy for the appropriate horizon on each step. </p></li></ul><p>
Similar to our definition of [mathjaxinline]V[/mathjaxinline] for evaluating a policy, we define the [mathjaxinline]Q[/mathjaxinline] function recursively according to the horizon. The only difference is that, in the recursion for horizon [mathjaxinline]h[/mathjaxinline], rather than evaluating the next state using the action specified by a given policy, we select the action [mathjaxinline]a'[/mathjaxinline] that maximizes the expected [mathjaxinline]Q^{h-1}[/mathjaxinline] value of the next state. </p><table id="a0000000023" cellpadding="7" width="100%" cellspacing="0" class="eqnarray" style="table-layout:auto"><tr id="a0000000024"><td style="width:40%; border:none"> </td><td style="vertical-align:middle; text-align:right; border:none">
[mathjaxinline]\displaystyle Q^0(s, a)[/mathjaxinline]
</td><td style="vertical-align:middle; text-align:left; border:none">
[mathjaxinline]\displaystyle = 0[/mathjaxinline]
</td><td style="width:40%; border:none"> </td><td style="width:20%; border:none" class="eqnnum"> </td></tr><tr id="a0000000025"><td style="width:40%; border:none"> </td><td style="vertical-align:middle; text-align:right; border:none">
[mathjaxinline]\displaystyle Q^1(s, a)[/mathjaxinline]
</td><td style="vertical-align:middle; text-align:left; border:none">
[mathjaxinline]\displaystyle = R(s, a) + 0[/mathjaxinline]
</td><td style="width:40%; border:none"> </td><td style="width:20%; border:none" class="eqnnum"> </td></tr><tr id="a0000000026"><td style="width:40%; border:none"> </td><td style="vertical-align:middle; text-align:right; border:none">
[mathjaxinline]\displaystyle Q^2(s, a)[/mathjaxinline]
</td><td style="vertical-align:middle; text-align:left; border:none">
[mathjaxinline]\displaystyle = R(s, a) + \sum _{s'}T(s, a, s') \max _{a'} R(s', a')[/mathjaxinline]
</td><td style="width:40%; border:none"> </td><td style="width:20%; border:none" class="eqnnum"> </td></tr><tr id="a0000000027"><td style="width:40%; border:none"> </td><td style="vertical-align:middle; text-align:right; border:none">
[mathjaxinline]\displaystyle \vdots[/mathjaxinline]
</td><td style="width:40%; border:none"> </td><td style="width:20%; border:none" class="eqnnum"> </td></tr><tr id="a0000000028"><td style="width:40%; border:none"> </td><td style="vertical-align:middle; text-align:right; border:none">
[mathjaxinline]\displaystyle Q^ h(s, a)[/mathjaxinline]
</td><td style="vertical-align:middle; text-align:left; border:none">
[mathjaxinline]\displaystyle = R(s, a) + \sum _{s'}T(s, a, s') \max _{a'} Q^{h - 1}(s', a')[/mathjaxinline]
</td><td style="width:40%; border:none"> </td><td style="width:20%; border:none" class="eqnnum"> </td></tr></table><p>
We can solve for the values of [mathjaxinline]Q[/mathjaxinline] with a simple recursive algorithm called <i class="it">value iteration</i>, which just computes [mathjaxinline]Q^ h[/mathjaxinline] starting from horizon 0 and working up to the desired horizon [mathjaxinline]H[/mathjaxinline] (that is, backward in time from the end of the interaction). Given [mathjaxinline]Q[/mathjaxinline], an optimal policy is easy to find: </p><table id="a0000000029" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]\pi _ h^*(s) = \text {arg}\max _{a}Q^ h(s, a) \; \; .[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
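In code, finite-horizon value iteration is a direct transcription of the recursion above; here is a minimal sketch under the same assumed array conventions as before:
</p>
<pre><code>
import numpy as np

def finite_horizon_value_iteration(T, R, H):
    """Return a list Q with Q[h][s, a] equal to the horizon-h action value."""
    n, m = R.shape
    Q = [np.zeros((n, m))]          # Q^0(s, a) = 0
    for h in range(1, H + 1):
        # max over a' of Q^{h-1}(s', a'), one value per next state s'
        V_prev = Q[-1].max(axis=1)
        # Q^h(s, a) = R(s, a) + sum_{s'} T(s, a, s') * V_prev(s')
        Q.append(R + T @ V_prev)
    return Q

# An optimal action at horizon h (one of possibly several):
# pi_h = Q[h].argmax(axis=1)
</code></pre>
<p>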
There may be multiple possible optimal policies. </p><p><div style="border-radius:10px;padding:5px;border-style:solid;background-color:rgba(0,255,0,0.03);" class="examplebox"><p><b class="bf">Dynamic programming</b> (somewhat counter-intuitively, dynamic programming is neither really “dynamic" nor a type of “programming" as we typically understand it) is a technique for designing efficient algorithms. Most methods for solving <i class="sc">mdp</i>s or computing value functions rely on dynamic programming to be efficient. </p><p>
The <em>principle of dynamic programming</em> is to compute and store the solutions to simple sub-problems that can be re-used later in the computation. It is a very important tool in our algorithmic toolbox. </p><p>
Let's consider what would happen if we tried to compute [mathjaxinline]Q^4(s, a)[/mathjaxinline] for all [mathjaxinline](s, a)[/mathjaxinline] by directly using the definition: </p><ul class="itemize"><li><p>
To compute [mathjaxinline]Q^4(s_ i, a_ j)[/mathjaxinline] for any one [mathjaxinline](s_ i, a_ j)[/mathjaxinline], we would need to compute [mathjaxinline]Q^3(s, a)[/mathjaxinline] for all [mathjaxinline](s, a)[/mathjaxinline] pairs. </p></li><li><p>
To compute [mathjaxinline]Q^3(s_ i, a_ j)[/mathjaxinline] for any one [mathjaxinline](s_ i, a_ j)[/mathjaxinline], we'd need to compute [mathjaxinline]Q^2(s, a)[/mathjaxinline] for all [mathjaxinline](s, a)[/mathjaxinline] pairs. </p></li><li><p>
To compute [mathjaxinline]Q^2(s_ i, a_ j)[/mathjaxinline] for any one [mathjaxinline](s_ i, a_ j)[/mathjaxinline], we'd need to compute [mathjaxinline]Q^1(s, a)[/mathjaxinline] for all [mathjaxinline](s, a)[/mathjaxinline] pairs. </p></li><li><p>
Luckily, those are just our [mathjaxinline]R(s, a)[/mathjaxinline] values. </p></li></ul><p>
So, if we have [mathjaxinline]n[/mathjaxinline] states and [mathjaxinline]m[/mathjaxinline] actions, this is [mathjaxinline]O((mn)^3)[/mathjaxinline] work; that seems like way too much, especially as the horizon increases! But observe that we really only have [mathjaxinline]mnh[/mathjaxinline] values that need to be computed, [mathjaxinline]Q^ h(s, a)[/mathjaxinline] for all [mathjaxinline]h, s, a[/mathjaxinline]. If we start with [mathjaxinline]h=1[/mathjaxinline], compute and store those values, and then use and reuse the stored [mathjaxinline]Q^{h-1}(s, a)[/mathjaxinline] values to compute the [mathjaxinline]Q^ h(s, a)[/mathjaxinline] values, we can do all this computation with only [mathjaxinline]O(mnh)[/mathjaxinline] value-updates, which is much better! </p></div></p><p><h3>Infinite-horizon solutions</h3></p><p>
It is actually more typical to work in a regime where the actual finite horizon is not known. This is called the <em>infinite horizon</em> version of the problem: you don't know when the game will be over! However, if we tried to simply take our definition of [mathjaxinline]Q^ h[/mathjaxinline] above and set [mathjaxinline]h = \infty[/mathjaxinline], we would be in trouble, because it could well be that the [mathjaxinline]Q^\infty[/mathjaxinline] values for all actions would be infinite, and there would be no way to select one over the other. </p><p>
There are two standard ways to deal with this problem. One is to take a kind of <em>average</em> over all time steps, but this can be a little bit tricky to think about. We'll take a different approach, which is to consider the <em>discounted</em> infinite horizon. We select a discount factor [mathjaxinline]0 < \gamma < 1[/mathjaxinline]. Instead of trying to find a policy that maximizes expected finite-horizon undiscounted value, </p><table id="a0000000030" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]\mathbb {E}\left[\sum _{t = 0}^{h}R_ t \mid \pi , s_0\right]\; \; ,[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
we will try to find one that maximizes the expected <i class="it">infinite horizon discounted value</i>, which is </p><table id="a0000000031" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]\mathbb {E}\left[\sum _{t = 0}^{\infty }\gamma ^ t R_ t \mid \pi , s_0\right] = \mathbb {E}\left[R_0 + \gamma R_1 + \gamma ^2 R_2 + \ldots \mid \pi , s_0\right] \; \; .[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
Note that the [mathjaxinline]t[/mathjaxinline] indices here are not the number of steps to go, but actually the number of steps forward from the starting state (there is no sensible notion of “steps to go" in the infinite horizon case). </p><p>
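It is worth spelling out why discounting solves the divergence problem. Because [mathjaxinline]\mathcal S[/mathjaxinline] and [mathjaxinline]\mathcal A[/mathjaxinline] are finite, every reward satisfies [mathjaxinline]|R_ t| \le R_{\max }[/mathjaxinline] for some bound [mathjaxinline]R_{\max }[/mathjaxinline], so by the geometric series
</p><table class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]\left|\mathbb {E}\left[\sum _{t = 0}^{\infty }\gamma ^ t R_ t \mid \pi , s_0\right]\right| \le R_{\max }\sum _{t = 0}^{\infty }\gamma ^ t = \frac{R_{\max }}{1 - \gamma }\; \; ,[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
and so, for [mathjaxinline]\gamma < 1[/mathjaxinline], every policy has a finite value and policies can be compared.
</p><p>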
There are two good intuitive motivations for discounting. One is related to economic theory and the present value of money: you'd generally rather have some money today than that same amount of money next week (because you could use it now or invest it). The other is to think of the whole process terminating, with probability [mathjaxinline]1-\gamma[/mathjaxinline] on each step of the interaction. Under this terminating model, the discounted value is exactly the expected total reward the agent would gain before termination. </p><p><h4>Evaluating a policy</h4> We will start, again, by evaluating a policy, but now in terms of the expected discounted infinite-horizon value that the agent will get in the <i class="sc">mdp</i> if it executes that policy. We define the value of a state [mathjaxinline]s[/mathjaxinline] under policy [mathjaxinline]\pi[/mathjaxinline] as </p><table id="a0000000032" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]V_{\pi }(s) = \mathbb {E}[R_0 + \gamma R_1 + \gamma ^2 R_2 + \dots \mid \pi , S_0 = s] = \mathbb {E}[R_0 + \gamma (R_1 + \gamma (R_2 + \gamma \dots )) \mid \pi , S_0 = s] \; \; .[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
Because the expectation of a linear combination of random variables is the linear combination of the expectations, we have </p><table id="a0000000033" cellpadding="7" width="100%" cellspacing="0" class="eqnarray" style="table-layout:auto"><tr id="a0000000034"><td style="width:40%; border:none"> </td><td style="vertical-align:middle; text-align:right; border:none">
[mathjaxinline]\displaystyle V_{\pi }(s)[/mathjaxinline]
</td><td style="vertical-align:middle; text-align:left; border:none">
[mathjaxinline]\displaystyle = \mathbb {E}[R_0 \mid \pi , S_0 = s] + \gamma \mathbb {E}[ R_1 + \gamma (R_2 + \gamma \dots ) \mid \pi , S_0 = s][/mathjaxinline]
</td><td style="width:40%; border:none"> </td><td style="width:20%; border:none" class="eqnnum"> </td></tr><tr id="a0000000035"><td style="width:40%; border:none"> </td><td style="vertical-align:middle; text-align:right; border:none">
</td><td style="vertical-align:middle; text-align:left; border:none">
[mathjaxinline]\displaystyle = R(s, \pi (s)) + \gamma \sum _{s'}T(s, \pi (s), s')V_{\pi }(s')[/mathjaxinline]
</td><td style="width:40%; border:none"> </td><td style="width:20%; border:none" class="eqnnum"> </td></tr></table><p><span options="" class="marginote"><span class="marginote_desc" style="display:none">This is <em>so</em> cool! In a discounted model, if you find that you survived this round and landed in some state [mathjaxinline]s'[/mathjaxinline], then you have the same expected future lifetime as you did before. So the value function that is relevant in that state is exactly the same one as in state [mathjaxinline]s[/mathjaxinline].</span><span>note</span></span></p><p>
You could write down one of these equations for each of the [mathjaxinline]n = |\mathcal S|[/mathjaxinline] states. There are [mathjaxinline]n[/mathjaxinline] unknowns [mathjaxinline]V_{\pi }(s)[/mathjaxinline]. These are linear equations, and so it's easy to solve them using Gaussian elimination to find the value of each state under this policy. </p><p><h4>Finding an optimal policy</h4> The best way of behaving in an infinite-horizon discounted <i class="sc">mdp</i> is not time-dependent: at every step, your expected future lifetime, given that you have survived until now, is [mathjaxinline]1 / (1 - \gamma )[/mathjaxinline]. <br/> <br/><span style="color:#FF0000"><b class="bf">Study Question:</b></span> <span style="color:#0000FF"> Verify this fact: if, on every day you wake up, there is a probability of [mathjaxinline]1 - \gamma[/mathjaxinline] that today will be your last day, then your expected lifetime is [mathjaxinline]1 / (1 - \gamma )[/mathjaxinline] days. </span> <br/></p><p>
An important theorem about <i class="sc">mdp</i>s is: there <span options="" class="marginote"><span class="marginote_desc" style="display:none">Stationary means that it doesn't change over time; the optimal policy in a finite-horizon <i class="sc">mdp</i> is <em>non-stationary.</em></span><span>exists a stationary </span></span> optimal policy [mathjaxinline]\pi ^*[/mathjaxinline] (there may be more than one) such that for all [mathjaxinline]s \in \mathcal S[/mathjaxinline] and all other policies [mathjaxinline]\pi[/mathjaxinline], we have </p><table id="a0000000036" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]V_{\pi ^*}(s) \ge V_{\pi }(s) \; \; .[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
There are many methods for finding an optimal policy for an <i class="sc">mdp</i>. We will study a very popular and useful method called <em>value iteration</em>. It is also important to us, because it is the basis of many <em>reinforcement-learning</em> methods. </p><p>
Define [mathjaxinline]Q^*(s, a)[/mathjaxinline] to be the expected infinite-horizon discounted value of being in state [mathjaxinline]s[/mathjaxinline], executing action [mathjaxinline]a[/mathjaxinline], and executing an optimal policy [mathjaxinline]\pi ^*[/mathjaxinline] thereafter. Using similar reasoning to the recursive definition of [mathjaxinline]V_\pi[/mathjaxinline], we can express this value recursively as </p><table id="a0000000037" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]Q^*(s, a) = R(s, a) + \gamma \sum _{s'}T(s, a, s')\max _{a'}Q^*(s', a') \; \; .[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
This is also a set of equations, one for each [mathjaxinline](s, a)[/mathjaxinline] pair. This time, though, they are not linear (the [mathjaxinline]\max[/mathjaxinline] is what breaks linearity), and so they are not easy to solve. But there is a theorem that says they have a unique solution! </p><p>
If we knew the optimal action-value function, then we could derive an optimal policy [mathjaxinline]\pi ^*[/mathjaxinline] as </p><table id="a0000000038" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]\pi ^*(s) = \text {arg}\max _{a}Q^*(s, a) \; \; .[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table><p>
<br/> <br/><span style="color:#FF0000"><b class="bf">Study Question:</b></span> <span style="color:#0000FF">The optimal value function is unique, but the optimal policy is not. Think of a situation in which there is more than one optimal policy.</span> <br/></p><p>
We can iteratively solve for the [mathjaxinline]Q^*[/mathjaxinline] values with the value iteration algorithm, shown below: </p><p><img src="/assets/courseware/v1/2de4f2f26f69961079a98dcba45dee2f/asset-v1:MITx+6.036+1T2019+type@asset+block/images_state_machines_markov_decision_processes_codebox_1-crop.png" width="691"/></p><p><h4>Theory</h4></p><p>
There are a lot of nice theoretical results about value iteration. For some given (not necessarily optimal) [mathjaxinline]Q[/mathjaxinline] function, define [mathjaxinline]\pi _{Q}(s) = \text {arg}\max _{a}Q(s, a)[/mathjaxinline]. </p><ul class="itemize"><li><p>
After executing value iteration with parameter [mathjaxinline]\epsilon[/mathjaxinline], [mathjaxinline]\lVert V_{\pi _{Q_{\text {new}}}} - V_{\pi ^*} \rVert _{\text {max}} < \epsilon[/mathjaxinline]. <span options="" class="marginote"><span class="marginote_desc" style="display:none">This is new notation! Given two functions [mathjaxinline]f[/mathjaxinline] and [mathjaxinline]f'[/mathjaxinline], we write [mathjaxinline]\lVert f - f' \rVert _\text {max}[/mathjaxinline] to mean [mathjaxinline]\max _ x \lvert f(x) - f'(x)\rvert[/mathjaxinline]. It measures the maximum absolute disagreement between the two functions at any input [mathjaxinline]x[/mathjaxinline].</span><span>note</span></span> </p></li><li><p>
There is a value of [mathjaxinline]\epsilon[/mathjaxinline] such that </p><table id="a0000000039" class="equation" width="100%" cellspacing="0" cellpadding="7" style="table-layout:auto"><tr><td class="equation" style="width:80%; border:none">[mathjax]\lVert Q_{\text {old}} - Q_{\text {new}} \rVert _{\text {max}} < \epsilon \Longrightarrow \pi _{Q_{\text {new}}} = \pi ^*[/mathjax]</td><td class="eqnnum" style="width:20%; border:none"> </td></tr></table></li><li><p>
As the algorithm executes, [mathjaxinline]\lVert V_{\pi _{Q_{\text {new}}}} - V_{\pi ^*} \rVert _{\text {max}}[/mathjaxinline] decreases monotonically on each iteration. </p></li><li><p>
The algorithm can be executed asynchronously, in parallel: as long as all [mathjaxinline](s, a)[/mathjaxinline] pairs are updated infinitely often in an infinite run, it still converges <span options="" class="marginote"><span class="marginote_desc" style="display:none">This is very important for reinforcement learning.</span><span>to optimal value. </span></span> </p></li></ul><p>
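Pulling these pieces together, here is a minimal runnable sketch of value iteration with the max-norm stopping test (an assumed numpy rendering of the boxed algorithm above, not the course's exact code):
</p>
<pre><code>
import numpy as np

def value_iteration(T, R, gamma, eps):
    """Iterate the Q* equation until the max-norm change drops below eps."""
    n, m = R.shape
    Q = np.zeros((n, m))
    while True:
        # Q_new(s, a) = R(s, a) + gamma * sum_{s'} T(s, a, s') max_{a'} Q(s', a')
        Q_new = R + gamma * (T @ Q.max(axis=1))
        if np.abs(Q_new - Q).max() < eps:   # || Q_old - Q_new ||_max < eps
            return Q_new
        Q = Q_new

def greedy_policy(Q):
    """pi_Q(s) = argmax_a Q(s, a); ties broken arbitrarily."""
    return Q.argmax(axis=1)
</code></pre>
<p>
For the tiny example <i class="sc">mdp</i> sketched earlier, <code>greedy_policy(value_iteration(T, R, 0.9, 1e-6))</code> returns a stationary optimal policy.
</p>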
<p><a href="/assets/courseware/v1/caee9a9ff60ed183e4485869c2e88ac4/asset-v1:MITx+6.036+1T2019+type@asset+block/notes_chapter_Sequential_models.pdf" target="_blank">Download this chapter as a PDF file</a></p>
</div>
</div>
</div>
</div>
<div class="xblock xblock-public_view xblock-public_view-vertical" data-block-type="vertical" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="VerticalStudentView" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@vertical+block@MIT6036L08m_vert" data-graded="False" data-runtime-class="LmsRuntime">
<h2 class="hd hd-2 unit-title">Lecture: SM - MDP - Grid world example demos</h2>
<div class="vert-mod">
<div class="vert vert-0" data-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08m">
<div class="xblock xblock-public_view xblock-public_view-video xmodule_display xmodule_VideoBlock" data-block-type="video" data-has-score="False" data-runtime-version="1" data-course-id="course-v1:MITx+6.036+1T2019" data-init="XBlockToXModuleShim" data-request-token="60c19cfe03ed11f099b802fa081815af" data-usage-id="block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08m" data-graded="False" data-runtime-class="LmsRuntime">
<script type="json/xblock-args" class="xblock-json-init-args">
{"xmodule-type": "Video"}
</script>
<h3 class="hd hd-2">Lecture: SM - MDP - Grid world example demos</h3>
<div
id="video_MIT6036L08m"
class="video closed"
data-metadata='{"saveStateEnabled": false, "autoplay": false, "publishCompletionUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08m/handler/publish_completion", "start": 0.0, "prioritizeHls": false, "saveStateUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08m/handler/xmodule_handler/save_user_state", "recordedYoutubeIsAvailable": true, "streams": "1.00:0eFJvy2U0BU", "autoAdvance": false, "transcriptAvailableTranslationsUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08m/handler/transcript/available_translations", "captionDataDir": null, "ytMetadataEndpoint": "", "showCaptions": "true", "lmsRootURL": "https://openlearninglibrary.mit.edu", "transcriptTranslationUrl": "/courses/course-v1:MITx+6.036+1T2019/xblock/block-v1:MITx+6.036+1T2019+type@video+block@MIT6036L08m/handler/transcript/translation/__lang__", "ytApiUrl": "https://www.youtube.com/iframe_api", "transcriptLanguages": {"en": "English"}, "speed": null, "autohideHtml5": false, "generalSpeed": 1.0, "transcriptLanguage": "en", "savedVideoPosition": 0.0, "poster": null, "sources": [], "duration": 0.0, "end": 0.0, "completionEnabled": false, "completionPercentage": 0.95, "ytTestTimeout": 1500}'
data-bumper-metadata='null'
data-autoadvance-enabled="False"
data-poster='null'
tabindex="-1"
>
<div class="focus_grabber first"></div>
<div class="tc-wrapper">
<div class="video-wrapper">
<span tabindex="0" class="spinner" aria-hidden="false" aria-label="Loading video player"></span>
<span tabindex="-1" class="btn-play fa fa-youtube-play fa-2x is-hidden" aria-hidden="true" aria-label="Play video"></span>
<div class="video-player-pre"></div>
<div class="video-player">
<div id="MIT6036L08m"></div>
<h4 class="hd hd-4 video-error is-hidden">No playable video sources found.</h4>
<h4 class="hd hd-4 video-hls-error is-hidden">
Your browser does not support this video format. Try using a different browser.
</h4>
</div>
<div class="video-player-post"></div>
<div class="closed-captions"></div>
<div class="video-controls is-hidden">
<div>
<div class="vcr"><div class="vidtime">0:00 / 0:00</div></div>
<div class="secondary-controls"></div>
</div>
</div>
</div>
</div>
<div class="focus_grabber last"></div>
</div>
</div>
</div>
</div>
</div>