Lesson 2 - Capturing more of the syntax
In the previous lesson we saw how simple bag-of-words representations can be used to find similar documents. While this works reasonably well for finding relevant documents, the model has a very simple representation of language in which all meaning derived from syntax is lost: for instance, "the dog bit the man" and "the man bit the dog" get identical bag-of-words representations. We'll now look at how we can use pre-trained neural networks to get representations of text that capture some of this syntax.
Today, this is most often done with Transformer neural networks pre-trained with language modelling. Essentially, the pre-training task is framed as learning the joint distribution over text by estimating a factorization of it into conditional distributions. This can be done in many ways (e.g. GPT, BERT, XLNet).
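For an autoregressive model like GPT, for instance, the factorization is simply the chain rule over tokens, while a masked model like BERT instead estimates the conditional distributions of masked-out tokens given their visible context:

$$p(x_1, \dots, x_T) = \prod_{t=1}^{T} p(x_t \mid x_1, \dots, x_{t-1})$$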
This pre-training task has been shown to work well when the model is later fine-tuned on a supervised downstream task. In our case, though, we would like to use a representation of the documents for similarity search without doing any additional fine-tuning.
To do this, we will use Sentence-BERT (sBERT), a variant of the BERT training procedure that fine-tunes the model in a siamese network setup so that its sentence embeddings work better as semantic representations that can be compared directly.
Huggingface Transformers
Much of the community around pre-trained language models has centered on a project named Huggingface Transformers. It started as a library of basic Transformer models (in particular pre-trained BERT and GPT models), but has grown into a substantial platform for pre-trained models.
Huggingface makes working with these models simple and hides much of the inner workings behind an easy-to-use interface.
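As a quick illustration of that interface (separate from the patent pipeline below; 'bert-base-uncased' is just a well-known example checkpoint), a pretrained tokenizer can be loaded by name and applied directly:

from transformers import AutoTokenizer

example_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# Pretrained tokenizers use subword vocabularies, so rare words are split into '##'-prefixed pieces
print(example_tokenizer.tokenize('Transformer models capture more of the syntax.'))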
import zipfile
from pathlib import Path
import urllib.request

data_url = "https://cdn.thingiverse.com/assets/d0/b3/68/63/1e/Gate_Guide_Spacer_v9.stl"
data_root = Path('data')
data_path = data_root / 'sampled_archive.zip'
data_root.mkdir(exist_ok=True)
# Download the archive only if it is not already present (assuming data_url points at the sampled patent archive)
if not data_path.exists():
    urllib.request.urlretrieve(data_url, data_path)
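If the download succeeded, the archive should contain one JSON file per patent, stored under a directory named after its label (this is the layout the ZipPatentCorpus class below assumes); a quick sanity check could be:

with zipfile.ZipFile(data_path) as zf:
    # Each entry should look like '<label>/<document-id>.json'
    print(zf.namelist()[:5])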
from collections.abc import Sequence
from collections import defaultdict
import json
class ZipPatentCorpus:
    def __init__(self, *, document_archive: Path, document_parts=('abstract', 'description', 'claims'), lang='en'):
        self.document_archive = document_archive
        self.document_zf = zipfile.ZipFile(self.document_archive)
        self.document_parts = document_parts
        self.lang = lang
        self.documents = sorted(filename for filename in self.document_zf.namelist())
        self.symbolic_labels = []
        self.labeled_documents = defaultdict(list)
        for document in self.documents:
            label, sep, file = document.rpartition('/')
            self.symbolic_labels.append(label)
            self.labeled_documents[label].append(document)
        self.label_codes = {label: i for i, label in enumerate(sorted(self.labeled_documents.keys()))}
        self.labels = [self.label_codes[label] for label in self.symbolic_labels]

    def __len__(self):
        return len(self.documents)

    def load_document(self, document_path):
        with self.document_zf.open(document_path) as fp:
            document = json.load(fp)
        document_str = '\n'.join([document[part][self.lang] for part in self.document_parts])
        return document_str

    def __getitem__(self, item):
        # Lazily load documents here
        if isinstance(item, slice):
            document_paths = self.documents[item]
            document_str = [self.load_document(document_path) for document_path in document_paths]
        elif isinstance(item, Sequence):
            document_str = [self.load_document(self.documents[idx]) for idx in item]
        else:
            document_str = self.load_document(self.documents[item])
        return document_str

    def get_label(self, i):
        return self.labels[i]

    def get_symbolic_label(self, i):
        return self.symbolic_labels[i]
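As a small usage sketch (assuming data_path points at the downloaded archive): the corpus can be indexed with single integers, slices, or sequences of indices, and documents are only read from the zip file when accessed.

corpus = ZipPatentCorpus(document_archive=data_path, document_parts=['abstract'])
print(len(corpus), 'documents in', len(corpus.label_codes), 'label groups')
print(corpus.get_symbolic_label(0))   # the label directory of the first document
print(corpus[0][:200])                # a single document (here: the first 200 characters of its abstract)
print(len(corpus[:3]))                # a slice returns a list of documents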
import re
from collections import Counter
from tqdm import tqdm
class Tokenizer:
    def __init__(self,
                 *,
                 max_vocab_size,
                 stoplist=('the', 'of', 'a', 'and', 'to', 'in', 'is', 'or', 'an', 'by', 'as', 'be', 'for'),
                 wordpattern=r"[A-Za-z0-9\-\+='.]*[A-Za-z][A-Za-z0-9\-\+='.]*"
                 ):
        self.max_vocab_size = max_vocab_size
        self.stoplist = stoplist
        self.wordpattern = re.compile(wordpattern)

    def tokenize(self, text):
        return [word.strip('.') for word in re.findall(self.wordpattern, text.lower())]

    def encode(self, tokenized_text):
        try:
            term_to_index = self.term_to_index
        except AttributeError:
            raise RuntimeError("Tokenizer is missing the term-to-index mapping, did you call Tokenizer.fit() or Tokenizer.fit_transform()?")
        return [term_to_index[term] for term in tokenized_text if term in term_to_index]

    def decode(self, encoded_text):
        try:
            index_to_term = self.index_to_term
        except AttributeError:
            raise RuntimeError("Tokenizer is missing the index-to-term mapping, did you call Tokenizer.fit() or Tokenizer.fit_transform()?")
        return [index_to_term[idx] for idx in encoded_text]

    def make_vocab(self, documents_term_frequencies):
        document_occurrence_counts = Counter()
        for document_term_frequency in documents_term_frequencies:
            # Count each unique term once per document
            document_occurrence_counts.update(document_term_frequency.keys())
        for stopword in self.stoplist:
            del document_occurrence_counts[stopword]
        self.vocabulary = sorted(term for term, count in document_occurrence_counts.most_common(self.max_vocab_size) if count > 1)
        self.term_to_index = {term: i for i, term in enumerate(self.vocabulary)}
        self.index_to_term = {i: term for term, i in self.term_to_index.items()}

    def fit(self, corpus):
        documents_term_frequencies = [Counter(self.tokenize(doc)) for doc in tqdm(corpus, desc="Tokenizing", leave=False)]
        self.make_vocab(documents_term_frequencies)

    def fit_transform(self, corpus):
        tokenized_docs = [self.tokenize(doc) for doc in tqdm(corpus, desc="Tokenizing", leave=False)]
        documents_term_frequencies = [Counter(tokens) for tokens in tokenized_docs]
        self.make_vocab(documents_term_frequencies)
        return [self.encode(tokenized_text) for tokenized_text in tqdm(tokenized_docs, desc="Encoding", leave=False)]

    def transform(self, text):
        tokenized_text = self.tokenize(text)
        encoded_text = self.encode(tokenized_text)
        return encoded_text

    def __len__(self):
        return len(self.vocabulary)
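A small usage sketch of the tokenizer on two made-up sentences (the vocabulary has to be fitted before encode/decode can be used; note that stopwords and terms occurring in only one document are dropped):

toy_tokenizer = Tokenizer(max_vocab_size=1000)
toy_docs = ['a device for measuring pressure', 'a pressure sensor device']
toy_encoded = toy_tokenizer.fit_transform(toy_docs)
print(toy_encoded[0])                        # indices into the fitted vocabulary
print(toy_tokenizer.decode(toy_encoded[0]))  # back to terms, e.g. ['device', 'pressure']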
N-gram models

import re
from collections import Counter
from tqdm import tqdm
class NGramTokenizer(Tokenizer):
    def __init__(self,
                 *,
                 n,
                 include_stop_ngrams=False,
                 **kwargs
                 ):
        self.n = n
        # If False, stopwords are removed before the n-grams are formed
        self.include_stop_ngrams = include_stop_ngrams
        super().__init__(**kwargs)

    def ngram_tokenize(self, text):
        tokenized = self.tokenize(text)
        if not self.include_stop_ngrams:
            tokenized = [token for token in tokenized if token not in self.stoplist]
        ngram_tokens = list(tokenized)
        # Note that since we use the n in the slice below, for 2-grams this offset is 1 and so on
        for n in range(1, self.n):
            ngram_tokens.extend(' '.join(tokenized[i:i + n + 1]) for i in range(len(tokenized) - n))
        return ngram_tokens

    def fit(self, corpus):
        documents_term_frequencies = [Counter(self.ngram_tokenize(doc))
                                      for doc in tqdm(corpus, desc="Tokenizing", leave=False)]
        self.make_vocab(documents_term_frequencies)

    def fit_transform(self, corpus):
        tokenized_docs = [self.ngram_tokenize(doc) for doc in tqdm(corpus, desc="Tokenizing", leave=False)]
        documents_term_frequencies = [Counter(tokens) for tokens in tokenized_docs]
        self.make_vocab(documents_term_frequencies)
        return [self.encode(tokenized_text) for tokenized_text in tqdm(tokenized_docs, desc="Encoding", leave=False)]

    def transform(self, text):
        tokenized_text = self.ngram_tokenize(text)
        encoded_text = self.encode(tokenized_text)
        return encoded_text

    def __len__(self):
        return len(self.vocabulary)
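And the same kind of sketch for the bigram tokenizer; with the default include_stop_ngrams=False, stopwords are removed before the n-grams are formed, and only n-grams surviving the vocabulary cut are encoded:

bigram_tokenizer = NGramTokenizer(n=2, max_vocab_size=1000)
toy_docs = ['a pressure sensor device', 'the pressure sensor reading']
toy_encoded = bigram_tokenizer.fit_transform(toy_docs)
print(bigram_tokenizer.decode(toy_encoded[0]))  # unigrams plus bigrams such as 'pressure sensor'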
text_corpus = ZipPatentCorpus(document_archive=data_path, document_parts=['abstract'])
tokenizer = Tokenizer(max_vocab_size=100000)
tokenized_docs = tokenizer.fit_transform(text_corpus)
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForMaskedLM
device = "cuda" if torch.cuda.is_available() else "cpu"
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]
model = SentenceTransformer('AI-Growth-Lab/PatentSBERTa', device=device)
embeddings = model.encode(sentences)
print(embeddings)
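Since we want to use these embeddings for similarity search, a natural follow-up is to compare them with cosine similarity. A minimal sketch using the util module shipped with sentence-transformers (in older versions the function is named util.pytorch_cos_sim):

from sentence_transformers import util

# Cosine similarity between the two sentence embeddings computed above
print(util.cos_sim(embeddings[0], embeddings[1]))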