JackArch
diff --git a/‎deep-learning/tensor-flow-examples/Harry Potter Book 8.ipynb
+363 b/‎deep-learning/tensor-flow-examples/Harry Potter Book 8.ipynb
+363
@@ -0,0 +1,363 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import tensorflow as tf\n",
+    "import codecs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Loading the data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Check that the book files exist"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import glob\n",
+    "\n",
+    "# Collect the book files in a stable (sorted) order. The pattern requires a real\n",
+    "# '.txt' extension instead of any file name that merely ends in the letters 'txt'.\n",
+    "book_filenames = sorted(glob.glob(\"data/*.txt\"))\n",
+    "\n",
+    "print(\"Found {} books\".format(len(book_filenames)))\n",
+    "\n",
+    "# Fail here, with a clear message, rather than much later on an empty corpus.\n",
+    "if not book_filenames:\n",
+    "    raise FileNotFoundError(\"No .txt book files found in the 'data' directory\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Joining the books into a string "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read every book as UTF-8 text, then concatenate the texts in file order.\n",
+    "book_texts = []\n",
+    "for filename in book_filenames:\n",
+    "    with codecs.open(filename, 'r', 'utf-8') as book_file:\n",
+    "        book_texts.append(book_file.read())\n",
+    "potter_raw = u\"\".join(book_texts)\n",
+    "print(\"Potter is \", len(potter_raw), \" characters long\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Process Potter "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Create lookup tables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def lookup_tables(text):\n",
+    "    \"\"\"\n",
+    "    Build the two lookup tables that translate between tokens and integer ids.\n",
+    "\n",
+    "    :param text: iterable of hashable, mutually comparable tokens (a list of\n",
+    "                 words, or a string, in which case each character is a token)\n",
+    "    :return: tuple (vocab_to_int, int_to_vocab)\n",
+    "    \"\"\"\n",
+    "    # Sort the vocabulary: the iteration order of a set of strings can change\n",
+    "    # between interpreter runs, which would give the same corpus different ids\n",
+    "    # every time the notebook is restarted.\n",
+    "    vocab = sorted(set(text))\n",
+    "    int_to_vocab = dict(enumerate(vocab))\n",
+    "    vocab_to_int = {word: key for key, word in int_to_vocab.items()}\n",
+    "    return vocab_to_int, int_to_vocab"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Tokenize punctuation "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def token_lookup():\n",
+    "    \"\"\"\n",
+    "    Build the table used to swap punctuation for stand-alone word tokens.\n",
+    "    :return: dictionary mapping each punctuation string to its '||name||' token\n",
+    "    \"\"\"\n",
+    "    punctuation_tokens = (\n",
+    "        ('.', '||period||'),\n",
+    "        (',', '||comma||'),\n",
+    "        ('\"', '||quotes||'),\n",
+    "        (';', '||semicolon||'),\n",
+    "        ('!', '||exclamation-mark||'),\n",
+    "        ('?', '||question-mark||'),\n",
+    "        ('(', '||left-parentheses||'),\n",
+    "        (')', '||right-parentheses||'),\n",
+    "        ('--', '||emm-dash||'),\n",
+    "        ('\\n', '||return||'),\n",
+    "    )\n",
+    "    return dict(punctuation_tokens)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Process and save data "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "\n",
+    "token_dict = token_lookup()\n",
+    "\n",
+    "# Work on a copy so that re-running this cell always starts from the raw text.\n",
+    "potter_tokenized = potter_raw\n",
+    "for token, replacement in token_dict.items():\n",
+    "    # Pad with spaces so every punctuation token becomes a word of its own.\n",
+    "    potter_tokenized = potter_tokenized.replace(token, ' {} '.format(replacement))\n",
+    "\n",
+    "# Lower-case and split in a single step. The resulting word list, not the raw\n",
+    "# string, is what has to be encoded: iterating over the string would build a\n",
+    "# vocabulary of single characters instead of words.\n",
+    "corpus_raw = potter_tokenized.lower().split()\n",
+    "\n",
+    "vocab_to_int, int_to_vocab = lookup_tables(corpus_raw)\n",
+    "potter_int = [vocab_to_int[word] for word in corpus_raw]\n",
+    "\n",
+    "# Save the encoded text together with the tables needed to decode it again;\n",
+    "# the context manager makes sure the file is flushed and closed.\n",
+    "with open('preprocess.p', 'wb') as preprocess_file:\n",
+    "    pickle.dump((potter_int, vocab_to_int, int_to_vocab, token_dict), preprocess_file)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Building the network"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Batching the data "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def get_batches(int_text, batch_size, seq_length):\n",
+    "    \"\"\"\n",
+    "    Cut an encoded text into (input, target) batches for next-token prediction.\n",
+    "\n",
+    "    :param int_text: list or 1-D array of integer token ids\n",
+    "    :param batch_size: number of sequences in each batch\n",
+    "    :param seq_length: number of ids in each sequence\n",
+    "    :return: array of shape (num_batches, 2, batch_size, seq_length); along the\n",
+    "             second axis, index 0 holds the inputs and index 1 the targets\n",
+    "    :raises ValueError: if the sizes are not positive or int_text is too short\n",
+    "                        to fill a single batch\n",
+    "    \"\"\"\n",
+    "    words_per_batch = batch_size * seq_length\n",
+    "    if words_per_batch <= 0:\n",
+    "        raise ValueError('batch_size and seq_length must both be positive')\n",
+    "\n",
+    "    num_batches = len(int_text) // words_per_batch\n",
+    "    if num_batches == 0:\n",
+    "        raise ValueError(\n",
+    "            'Need at least {} ids to fill one batch, got {}'.format(words_per_batch, len(int_text)))\n",
+    "\n",
+    "    # Drop the tail that does not fill a complete batch.\n",
+    "    x = np.asarray(int_text)[:num_batches * words_per_batch]\n",
+    "    # Targets are the inputs shifted one step to the left; the very last target\n",
+    "    # wraps around to the first id.\n",
+    "    y = np.roll(x, -1)\n",
+    "\n",
+    "    # One row per sequence slot, then cut the rows into seq_length-wide windows so\n",
+    "    # that row i of batch k+1 continues row i of batch k.\n",
+    "    x_batches = np.split(x.reshape(batch_size, -1), num_batches, axis=1)\n",
+    "    y_batches = np.split(y.reshape(batch_size, -1), num_batches, axis=1)\n",
+    "\n",
+    "    return np.array(list(zip(x_batches, y_batches)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Set the hyperparameters "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Training schedule\n",
+    "num_epochs = 10000      # passes over the full set of batches\n",
+    "learning_rate = 0.001   # step size handed to the Adam optimizer\n",
+    "\n",
+    "# Batch geometry (both are passed to get_batches)\n",
+    "batch_size = 512        # sequences per batch\n",
+    "seq_length = 30         # ids per sequence\n",
+    "\n",
+    "# Network size\n",
+    "embed_dim = 512         # width of the word embedding\n",
+    "rnn_size = 512          # units in each LSTM cell\n",
+    "num_layers = 3          # stacked LSTM layers\n",
+    "keep_prob = 0.7         # dropout keep probability on each LSTM output\n",
+    "\n",
+    "# Path handed to the saver when checkpointing\n",
+    "save_dir = './save'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Building the graph "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "train_graph = tf.Graph()\n",
+    "with train_graph.as_default():\n",
+    "\n",
+    "    # Initialize input placeholders\n",
+    "    input_text = tf.placeholder(tf.int32, [None, None], name='input')\n",
+    "    targets = tf.placeholder(tf.int32, [None, None], name='targets')\n",
+    "    lr = tf.placeholder(tf.float32, name='learning_rate')\n",
+    "\n",
+    "    # Calculate text attributes\n",
+    "    vocab_size = len(int_to_vocab)\n",
+    "    input_text_shape = tf.shape(input_text)\n",
+    "\n",
+    "    # Build the RNN cell. Every layer needs its own cell object: repeating one\n",
+    "    # object ([cell] * num_layers) makes all layers refer to the same cell.\n",
+    "    def build_cell():\n",
+    "        lstm = tf.contrib.rnn.BasicLSTMCell(num_units=rnn_size)\n",
+    "        # NOTE(review): keep_prob is a Python constant baked into the graph, so\n",
+    "        # dropout stays active whenever this graph is run -- confirm that is intended.\n",
+    "        return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)\n",
+    "\n",
+    "    cell = tf.contrib.rnn.MultiRNNCell([build_cell() for _ in range(num_layers)])\n",
+    "\n",
+    "    # Set the initial state. tf.identity turns the nested per-layer (c, h) tuples\n",
+    "    # into one named tensor that can be fetched and fed by name.\n",
+    "    zero_state = cell.zero_state(input_text_shape[0], tf.float32)\n",
+    "    initial_state = tf.identity(zero_state, name='initial_state')\n",
+    "\n",
+    "    # Unpack that tensor back into the per-layer LSTMStateTuple structure the cell\n",
+    "    # expects, so that a fed initial_state really is what the RNN starts from.\n",
+    "    # Assumes the packed layout is [layer][c or h] -- TODO confirm on the TF version in use.\n",
+    "    rnn_initial_state = tuple(\n",
+    "        tf.contrib.rnn.LSTMStateTuple(initial_state[layer][0], initial_state[layer][1])\n",
+    "        for layer in range(num_layers)\n",
+    "    )\n",
+    "\n",
+    "    # Create word embedding as input to RNN\n",
+    "    embed = tf.contrib.layers.embed_sequence(input_text, vocab_size, embed_dim)\n",
+    "\n",
+    "    # Build RNN, starting from the (feedable) initial state\n",
+    "    outputs, final_state = tf.nn.dynamic_rnn(\n",
+    "        cell, embed, initial_state=rnn_initial_state, dtype=tf.float32)\n",
+    "    final_state = tf.identity(final_state, name='final_state')\n",
+    "\n",
+    "    # Take RNN output and make logits\n",
+    "    logits = tf.contrib.layers.fully_connected(outputs, vocab_size, activation_fn=None)\n",
+    "\n",
+    "    # Calculate the probability of generating each word\n",
+    "    probs = tf.nn.softmax(logits, name='probs')\n",
+    "\n",
+    "    # Define loss function; every position gets the same weight of 1\n",
+    "    cost = tf.contrib.seq2seq.sequence_loss(\n",
+    "        logits,\n",
+    "        targets,\n",
+    "        tf.ones([input_text_shape[0], input_text_shape[1]])\n",
+    "    )\n",
+    "\n",
+    "    # Learning rate optimizer. Use the lr placeholder so the value fed at train\n",
+    "    # time is the one that is applied.\n",
+    "    optimizer = tf.train.AdamOptimizer(lr)\n",
+    "\n",
+    "    # Gradient clipping to avoid exploding gradients\n",
+    "    gradients = optimizer.compute_gradients(cost)\n",
+    "    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]\n",
+    "    train_op = optimizer.apply_gradients(capped_gradients)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Train the network "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "\n",
+    "# Store the sequence length and checkpoint path next to the model; the context\n",
+    "# manager makes sure the file is closed.\n",
+    "with open('params.p', 'wb') as params_file:\n",
+    "    pickle.dump((seq_length, save_dir), params_file)\n",
+    "\n",
+    "batches = get_batches(potter_int, batch_size, seq_length)\n",
+    "num_batches = len(batches)\n",
+    "start_time = time.time()\n",
+    "\n",
+    "with tf.Session(graph=train_graph) as sess:\n",
+    "    sess.run(tf.global_variables_initializer())\n",
+    "\n",
+    "    # Create the saver once; building a new tf.train.Saver() at every checkpoint\n",
+    "    # would keep adding save/restore ops to the graph.\n",
+    "    saver = tf.train.Saver()\n",
+    "\n",
+    "    for epoch in range(num_epochs):\n",
+    "        # Start every epoch from the zero state, sized for the first batch.\n",
+    "        state = sess.run(initial_state, {input_text: batches[0][0]})\n",
+    "\n",
+    "        for batch_index, (x, y) in enumerate(batches):\n",
+    "            feed_dict = {\n",
+    "                input_text: x,\n",
+    "                targets: y,\n",
+    "                initial_state: state,\n",
+    "                lr: learning_rate\n",
+    "            }\n",
+    "            # Carry the final state of this batch over to the next one.\n",
+    "            train_loss, state, _ = sess.run([cost, final_state, train_op], feed_dict)\n",
+    "\n",
+    "        # Report once per epoch; the remaining time is extrapolated from the\n",
+    "        # average time per processed batch so far.\n",
+    "        time_elapsed = time.time() - start_time\n",
+    "        print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}   time_elapsed = {:.3f}   time_remaining = {:.0f}'.format(\n",
+    "            epoch + 1,\n",
+    "            batch_index + 1,\n",
+    "            len(batches),\n",
+    "            train_loss,\n",
+    "            time_elapsed,\n",
+    "            ((num_batches * num_epochs)/((epoch + 1) * (batch_index + 1))) * time_elapsed - time_elapsed))\n",
+    "\n",
+    "        # save model every 10 epochs, and always after the last epoch so the\n",
+    "        # final weights are never lost\n",
+    "        if epoch % 10 == 0 or epoch == num_epochs - 1:\n",
+    "            saver.save(sess, save_dir)\n",
+    "            print('Model Trained and Saved')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}