From 7b3d15da18a24fdae650bf222bb9340e3fd39492 Mon Sep 17 00:00:00 2001 From: Dmitri Soshnikov <dmitri@soshnikov.com> Date: Wed, 25 May 2022 00:40:51 +0300 Subject: [PATCH] Fix torchtext issue --- .devcontainer/devcontainer.json | 2 + .devcontainer/requirements.txt | 5 +- README.md | 1 + binder/requirements.txt | 5 +- lessons/5-NLP/14-Embeddings/torchnlp.py | 4 +- lessons/5-NLP/16-RNN/torchnlp.py | 4 +- .../GenerativePyTorch.ipynb | 103 ++++++++--------- .../17-GenerativeNetworks/GenerativeTF.ipynb | 9 +- .../5-NLP/17-GenerativeNetworks/torchnlp.py | 104 ++++++++++++++++++ 9 files changed, 166 insertions(+), 71 deletions(-) create mode 100644 lessons/5-NLP/17-GenerativeNetworks/torchnlp.py diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 2f8841c..5c9a324 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -8,6 +8,8 @@ } }, + "hostRequirements": { "cpus": 4, "memory": "8gb", "storage": "100gb"}, + // Configure tool-specific properties. "customizations": { // Configure properties specific to VS Code. diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt index 36f009a..935d6db 100644 --- a/.devcontainer/requirements.txt +++ b/.devcontainer/requirements.txt @@ -21,7 +21,8 @@ tokenizers==0.10.3 torch==1.11.0 torchaudio torchinfo -torchtext -torchvision +torchtext==0.12.0 +torchvision==0.12.0 +torchdata tqdm==4.62.3 transformers==4.3.3 diff --git a/README.md b/README.md index e54db33..ced7a3b 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ [](https://GitHub.com/microsoft/AI-For-Beginners/watchers/) [](https://GitHub.com/microsoft/AI-For-Beginners/network/) [](https://GitHub.com/microsoft/AI-For-Beginners/stargazers/) +[](https://mybinder.org/v2/gh/microsoft/ai-for-beginners/HEAD) # Artificial Intelligence for Beginners - A Curriculum diff --git a/binder/requirements.txt b/binder/requirements.txt index 36f009a..935d6db 100644 --- a/binder/requirements.txt +++ b/binder/requirements.txt @@ -21,7 +21,8 @@ tokenizers==0.10.3 torch==1.11.0 torchaudio torchinfo -torchtext -torchvision +torchtext==0.12.0 +torchvision==0.12.0 +torchdata tqdm==4.62.3 transformers==4.3.3 diff --git a/lessons/5-NLP/14-Embeddings/torchnlp.py b/lessons/5-NLP/14-Embeddings/torchnlp.py index d6ca5e0..cd709f0 100644 --- a/lessons/5-NLP/14-Embeddings/torchnlp.py +++ b/lessons/5-NLP/14-Embeddings/torchnlp.py @@ -20,12 +20,12 @@ def load_dataset(ngrams=1,min_freq=1): counter = collections.Counter() for (label, line) in train_dataset: counter.update(torchtext.data.utils.ngrams_iterator(tokenizer(line),ngrams=ngrams)) - vocab = torchtext.vocab.Vocab(counter, min_freq=min_freq) + vocab = torchtext.vocab.vocab(counter, min_freq=min_freq) return train_dataset,test_dataset,classes,vocab def encode(x,voc=None,unk=0,tokenizer=tokenizer): v = vocab if voc is None else voc - return [v.stoi.get(s,unk) for s in tokenizer(x)] + return [v.get_stoi().get(s,unk) for s in tokenizer(x)] def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200): optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr) diff --git a/lessons/5-NLP/16-RNN/torchnlp.py b/lessons/5-NLP/16-RNN/torchnlp.py index d6ca5e0..cd709f0 100644 --- a/lessons/5-NLP/16-RNN/torchnlp.py +++ b/lessons/5-NLP/16-RNN/torchnlp.py @@ -20,12 +20,12 @@ def load_dataset(ngrams=1,min_freq=1): counter = collections.Counter() for (label, line) in train_dataset: 
counter.update(torchtext.data.utils.ngrams_iterator(tokenizer(line),ngrams=ngrams)) - vocab = torchtext.vocab.Vocab(counter, min_freq=min_freq) + vocab = torchtext.vocab.vocab(counter, min_freq=min_freq) return train_dataset,test_dataset,classes,vocab def encode(x,voc=None,unk=0,tokenizer=tokenizer): v = vocab if voc is None else voc - return [v.stoi.get(s,unk) for s in tokenizer(x)] + return [v.get_stoi().get(s,unk) for s in tokenizer(x)] def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200): optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr) diff --git a/lessons/5-NLP/17-GenerativeNetworks/GenerativePyTorch.ipynb b/lessons/5-NLP/17-GenerativeNetworks/GenerativePyTorch.ipynb index 04a069d..8420b35 100644 --- a/lessons/5-NLP/17-GenerativeNetworks/GenerativePyTorch.ipynb +++ b/lessons/5-NLP/17-GenerativeNetworks/GenerativePyTorch.ipynb @@ -53,9 +53,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Vocabulary size = 84\n", - "Encoding of 'a' is 4\n", - "Character with code 13 is h\n" + "Vocabulary size = 82\n", + "Encoding of 'a' is 1\n", + "Character with code 13 is c\n" ] } ], @@ -66,12 +66,12 @@ "counter = collections.Counter()\n", "for (label, line) in train_dataset:\n", " counter.update(char_tokenizer(line))\n", - "vocab = torchtext.vocab.Vocab(counter)\n", + "vocab = torchtext.vocab.vocab(counter)\n", "\n", "vocab_size = len(vocab)\n", "print(f\"Vocabulary size = {vocab_size}\")\n", - "print(f\"Encoding of 'a' is {vocab.stoi['a']}\")\n", - "print(f\"Character with code 13 is {vocab.itos[13]}\")" + "print(f\"Encoding of 'a' is {vocab.get_stoi()['a']}\")\n", + "print(f\"Character with code 13 is {vocab.get_itos()[13]}\")" ] }, { @@ -83,23 +83,23 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "tensor([43, 4, 11, 11, 2, 26, 5, 23, 2, 38, 3, 4, 10, 9, 2, 31, 11, 4,\n", - " 21, 2, 38, 4, 14, 25, 2, 34, 8, 5, 6, 2, 5, 13, 3, 2, 38, 11,\n", - " 4, 14, 25, 2, 55, 37, 3, 15, 5, 3, 10, 9, 56, 2, 37, 3, 15, 5,\n", - " 3, 10, 9, 2, 29, 2, 26, 13, 6, 10, 5, 29, 9, 3, 11, 11, 3, 10,\n", - " 9, 27, 2, 43, 4, 11, 11, 2, 26, 5, 10, 3, 3, 5, 58, 9, 2, 12,\n", - " 21, 7, 8, 12, 11, 7, 8, 18, 61, 22, 4, 8, 12, 2, 6, 19, 2, 15,\n", - " 11, 5, 10, 4, 29, 14, 20, 8, 7, 14, 9, 27, 2, 4, 10, 3, 2, 9,\n", - " 3, 3, 7, 8, 18, 2, 18, 10, 3, 3, 8, 2, 4, 18, 4, 7, 8, 23])" + "tensor([ 0, 1, 2, 2, 3, 4, 5, 6, 3, 7, 8, 1, 9, 10, 3, 11, 2, 1,\n", + " 12, 3, 7, 1, 13, 14, 3, 15, 16, 5, 17, 3, 5, 18, 8, 3, 7, 2,\n", + " 1, 13, 14, 3, 19, 20, 8, 21, 5, 8, 9, 10, 22, 3, 20, 8, 21, 5,\n", + " 8, 9, 10, 3, 23, 3, 4, 18, 17, 9, 5, 23, 10, 8, 2, 2, 8, 9,\n", + " 10, 24, 3, 0, 1, 2, 2, 3, 4, 5, 9, 8, 8, 5, 25, 10, 3, 26,\n", + " 12, 27, 16, 26, 2, 27, 16, 28, 29, 30, 1, 16, 26, 3, 17, 31, 3, 21,\n", + " 2, 5, 9, 1, 23, 13, 32, 16, 27, 13, 10, 24, 3, 1, 9, 8, 3, 10,\n", + " 8, 8, 27, 16, 28, 3, 28, 9, 8, 8, 16, 3, 1, 28, 1, 27, 16, 6])" ] }, - "execution_count": 5, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -128,29 +128,29 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(tensor([[43, 4, 11, ..., 18, 61, 22],\n", - " [ 4, 11, 11, ..., 61, 22, 4],\n", - " [11, 11, 2, ..., 22, 4, 8],\n", + "(tensor([[ 0, 1, 2, ..., 28, 29, 30],\n", + " [ 1, 2, 2, ..., 29, 30, 1],\n", + " [ 2, 2, 3, ..., 30, 1, 16],\n", " ...,\n", - " [37, 3, 
15, ..., 4, 18, 4],\n", - " [ 3, 15, 5, ..., 18, 4, 7],\n", - " [15, 5, 3, ..., 4, 7, 8]], device='cuda:0'),\n", - " tensor([[ 4, 11, 11, ..., 61, 22, 4],\n", - " [11, 11, 2, ..., 22, 4, 8],\n", - " [11, 2, 26, ..., 4, 8, 12],\n", + " [20, 8, 21, ..., 1, 28, 1],\n", + " [ 8, 21, 5, ..., 28, 1, 27],\n", + " [21, 5, 8, ..., 1, 27, 16]]),\n", + " tensor([[ 1, 2, 2, ..., 29, 30, 1],\n", + " [ 2, 2, 3, ..., 30, 1, 16],\n", + " [ 2, 3, 4, ..., 1, 16, 26],\n", " ...,\n", - " [ 3, 15, 5, ..., 18, 4, 7],\n", - " [15, 5, 3, ..., 4, 7, 8],\n", - " [ 5, 3, 10, ..., 7, 8, 23]], device='cuda:0'))" + " [ 8, 21, 5, ..., 28, 1, 27],\n", + " [21, 5, 8, ..., 1, 27, 16],\n", + " [ 5, 8, 9, ..., 27, 16, 6]]))" ] }, - "execution_count": 8, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -180,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -207,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -216,7 +216,7 @@ " out, s = net(enc(chars).view(1,-1).to(device))\n", " for i in range(size):\n", " nc = torch.argmax(out[0][-1])\n", - " chars.append(vocab.itos[nc])\n", + " chars.append(vocab.get_itos()[nc])\n", " out, s = net(nc.view(1,-1),s)\n", " return ''.join(chars)" ] @@ -234,35 +234,15 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Current loss = 4.442246913909912\n", - "today ggrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg\n", - "Current loss = 2.1178359985351562\n", - "today and a could a the to the to the to the to the to the to the to the to the to the to the to the to th\n", - "Current loss = 1.6465336084365845\n", - "today on Tuesday the company to the United States and a policing to the United States and a policing to th\n", - "Current loss = 2.3716814517974854\n", - "today to the United States and a new men to the United States and a new men to the United States and a new\n", - "Current loss = 1.6844098567962646\n", - "today of the first the first the first the first the first the first the first the first the first the fir\n", - "Current loss = 1.702707052230835\n", - "today of the United States a said the United States a said the United States a said the United States a sa\n", - "Current loss = 1.9633255004882812\n", - "today of the first the first the first the first the first the first the first the first the first the fir\n", - "Current loss = 1.8642014265060425\n", - "today of the second a second a second a second a second a second a second a second a second a second a sec\n", - "Current loss = 1.7720613479614258\n", - "today and and and the company of the company of the company of the company of the company of the company o\n", - "Current loss = 1.52818763256073\n", - "today and the company of the company of the company of the company of the company of the company of the co\n", - "Current loss = 1.5444810390472412\n", - "today and the counters to the first the counters to the first the counters to the first the counters to th\n" + "Current loss = 4.398899078369141\n", + "today sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr s\n" ] } ], @@ -355,7 +335,7 @@ " #nc = torch.argmax(out[0][-1])\n", " out_dist = out[0][-1].div(temperature).exp()\n", " nc = torch.multinomial(out_dist,1)[0]\n", - " 
chars.append(vocab.itos[nc])\n", + " chars.append(vocab.get_itos()[nc])\n", " out, s = net(nc.view(1,-1),s)\n", " return ''.join(chars)\n", " \n", @@ -372,10 +352,13 @@ } ], "metadata": { + "interpreter": { + "hash": "16af2a8bbb083ea23e5e41c7f5787656b2ce26968575d8763f2c4b17f9cd711f" + }, "kernelspec": { - "display_name": "py37_pytorch", + "display_name": "Python 3.8.12 ('py38')", "language": "python", - "name": "conda-env-py37_pytorch-py" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -387,7 +370,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.8.12" } }, "nbformat": 4, diff --git a/lessons/5-NLP/17-GenerativeNetworks/GenerativeTF.ipynb b/lessons/5-NLP/17-GenerativeNetworks/GenerativeTF.ipynb index 2b2e95b..67ebf03 100644 --- a/lessons/5-NLP/17-GenerativeNetworks/GenerativeTF.ipynb +++ b/lessons/5-NLP/17-GenerativeNetworks/GenerativeTF.ipynb @@ -455,10 +455,13 @@ } ], "metadata": { + "interpreter": { + "hash": "16af2a8bbb083ea23e5e41c7f5787656b2ce26968575d8763f2c4b17f9cd711f" + }, "kernelspec": { - "display_name": "py38_tensorflow", + "display_name": "Python 3.8.12 ('py38')", "language": "python", - "name": "conda-env-py38_tensorflow-py" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -470,7 +473,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.8.12" } }, "nbformat": 4, diff --git a/lessons/5-NLP/17-GenerativeNetworks/torchnlp.py b/lessons/5-NLP/17-GenerativeNetworks/torchnlp.py new file mode 100644 index 0000000..cd709f0 --- /dev/null +++ b/lessons/5-NLP/17-GenerativeNetworks/torchnlp.py @@ -0,0 +1,104 @@ +import builtins +import torch +import torchtext +import collections +import os + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +vocab = None +tokenizer = torchtext.data.utils.get_tokenizer('basic_english') + +def load_dataset(ngrams=1,min_freq=1): + global vocab, tokenizer + print("Loading dataset...") + train_dataset, test_dataset = torchtext.datasets.AG_NEWS(root='./data') + train_dataset = list(train_dataset) + test_dataset = list(test_dataset) + classes = ['World', 'Sports', 'Business', 'Sci/Tech'] + print('Building vocab...') + counter = collections.Counter() + for (label, line) in train_dataset: + counter.update(torchtext.data.utils.ngrams_iterator(tokenizer(line),ngrams=ngrams)) + vocab = torchtext.vocab.vocab(counter, min_freq=min_freq) + return train_dataset,test_dataset,classes,vocab + +def encode(x,voc=None,unk=0,tokenizer=tokenizer): + v = vocab if voc is None else voc + return [v.get_stoi().get(s,unk) for s in tokenizer(x)] + +def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200): + optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr) + loss_fn = loss_fn.to(device) + net.train() + total_loss,acc,count,i = 0,0,0,0 + for labels,features in dataloader: + optimizer.zero_grad() + features, labels = features.to(device), labels.to(device) + out = net(features) + loss = loss_fn(out,labels) #cross_entropy(out,labels) + loss.backward() + optimizer.step() + total_loss+=loss + _,predicted = torch.max(out,1) + acc+=(predicted==labels).sum() + count+=len(labels) + i+=1 + if i%report_freq==0: + print(f"{count}: acc={acc.item()/count}") + if epoch_size and count>epoch_size: + break + return total_loss.item()/count, acc.item()/count + +def padify(b,voc=None,tokenizer=tokenizer): + # b is the list of 
tuples of length batch_size + # - first element of a tuple = label, + # - second = feature (text sequence) + # build vectorized sequence + v = [encode(x[1],voc=voc,tokenizer=tokenizer) for x in b] + # compute max length of a sequence in this minibatch + l = max(map(len,v)) + return ( # tuple of two tensors - labels and features + torch.LongTensor([t[0]-1 for t in b]), + torch.stack([torch.nn.functional.pad(torch.tensor(t),(0,l-len(t)),mode='constant',value=0) for t in v]) + ) + +def offsetify(b,voc=None): + # first, compute data tensor from all sequences + x = [torch.tensor(encode(t[1],voc=voc)) for t in b] + # now, compute the offsets by accumulating the tensor of sequence lengths + o = [0] + [len(t) for t in x] + o = torch.tensor(o[:-1]).cumsum(dim=0) + return ( + torch.LongTensor([t[0]-1 for t in b]), # labels + torch.cat(x), # text + o + ) + +def train_epoch_emb(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200,use_pack_sequence=False): + optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr) + loss_fn = loss_fn.to(device) + net.train() + total_loss,acc,count,i = 0,0,0,0 + for labels,text,off in dataloader: + optimizer.zero_grad() + labels,text = labels.to(device), text.to(device) + if use_pack_sequence: + off = off.to('cpu') + else: + off = off.to(device) + out = net(text, off) + loss = loss_fn(out,labels) #cross_entropy(out,labels) + loss.backward() + optimizer.step() + total_loss+=loss + _,predicted = torch.max(out,1) + acc+=(predicted==labels).sum() + count+=len(labels) + i+=1 + if i%report_freq==0: + print(f"{count}: acc={acc.item()/count}") + if epoch_size and count>epoch_size: + break + return total_loss.item()/count, acc.item()/count + -- GitLab
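
The substance of this patch is the torchtext 0.12 vocabulary API change that the NLP lessons are adapted to: torchtext.vocab.Vocab(counter) is replaced by the torchtext.vocab.vocab(counter) factory, attribute lookups via .stoi/.itos are replaced by the get_stoi()/get_itos() methods, and torchtext/torchvision are pinned to 0.12.0 with torchdata added so the AG_NEWS pipeline keeps working. A minimal before/after sketch, assuming torchtext==0.12.0 as pinned in the requirements (the sample sentence is illustrative only):

    import collections
    import torchtext

    tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
    counter = collections.Counter()
    counter.update(tokenizer("Wall St. Bears Claw Back Into the Black"))

    # torchtext <= 0.11 (the code removed by this patch):
    #   vocab = torchtext.vocab.Vocab(counter)
    #   idx = vocab.stoi.get('wall', 0)

    # torchtext 0.12 (the code added by this patch): vocab() is a factory
    # function, and lookups go through methods instead of attributes.
    vocab = torchtext.vocab.vocab(counter, min_freq=1)
    idx = vocab.get_stoi().get('wall', 0)   # 0 doubles as the unk index, as in encode()
    print(idx, vocab.get_itos()[idx])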
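
The new lessons/5-NLP/17-GenerativeNetworks/torchnlp.py is the same helper module already used by lessons 14 and 16 (same blob cd709f0), so the generative-networks notebook can share load_dataset(), encode() and padify(). A sketch of how those helpers are typically wired together; the DataLoader wiring and batch size here are illustrative assumptions, not part of the diff:

    import torch
    from torchnlp import load_dataset, padify   # the module added by this patch

    # Downloads AG_NEWS (requires the newly added torchdata dependency) and
    # builds the vocabulary with the torchtext.vocab.vocab() factory.
    train_dataset, test_dataset, classes, vocab = load_dataset()

    # padify() turns a list of (label, text) pairs into a label tensor and a
    # zero-padded feature tensor, so it can act as a DataLoader collate_fn.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=16, collate_fn=padify, shuffle=True)

    labels, features = next(iter(train_loader))
    print(labels.shape, features.shape)   # (16,) and (16, longest sequence in the batch)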