#pip install PyPDF2
import pandas as pd
import matplotlib.pyplot as plt
from PyPDF2 import PdfFileReader
import PyPDF2
import re
# Listado de todos los archivos en el directorio
import os
from os import listdir
from os.path import isfile, join
print(os.getcwd())
C:\Users\suzak\DataCamp\Proyecto
# Listado de todos los archivos en el directorio
#
from os import listdir
from os.path import isfile, join
#Donde mypath es la dirección donde están guardados los artículos
mypath='C:/Users/suzak/DataCamp/Proyecto/inglés_txt'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
onlyfiles[0]
'A Comparative Analysis on Smart Farming Techniques using Internet of Things (IoT).txt'
num_paper=len(listdir(mypath))
num_paper
70
#crear un diccionario de artículos
articulos=[{}]
def llenar_diccionario(text,nombre_archivo):
articulo={}
text = re.sub('\n', ' ',text)
text = re.sub("METHODOLGY","METHODOLOGY",text)
articulo['Title']=nombre_archivo
##Extrayendo el abstract
txt_abstract(r"Abstract","Keywords",text,articulo)
txt_abstract(r"ABSTRACT","KEYWORDS",text,articulo)
txt_abstract(r"Abstract","KEYWORDS",text,articulo)
txt_abstract(r"Abstract","Index Terms",text,articulo)
txt_abstract(r"Abstract","Key Words",text,articulo)
txt_abstract(r"ABSTRACT","Keywords",text,articulo)
txt_abstract(r"Abstract","Key words",text,articulo)
##Extrayendo las keywords
txt_keywords(r"Keywords","Introduction",text,articulo)
txt_keywords(r"Keywords","INTRODUCTION",text,articulo)
txt_keywords(r"Keyword","Introduction",text,articulo)
txt_keywords(r"Key words","Introduction",text,articulo)
txt_keywords(r"KEYWORDS","INTRODUCTION",text,articulo)
txt_keywords(r"Index Terms","Introduction",text,articulo)
txt_keywords(r"Index Terms","INTRODUCTION",text,articulo)
txt_keywords(r"Key word","Introduction",text,articulo)
txt_keywords(r"Key Words","INTRODUCTION",text,articulo)
##Extrayendo la introducción
txt_introduction(r"Introduction","LITERATURE",text,articulo)
txt_introduction(r"INTRODUCTION","LITERATURE",text,articulo)
txt_introduction(r"INTRODUCTION","RESULTS",text,articulo)
txt_introduction(r"INTRODUCTION","RELATED WORKS",text,articulo)
txt_introduction(r"INTRODUCTION","Related Work",text,articulo)
txt_introduction(r"INTRODUCTION","IRRIGATION SYSTEM",text,articulo)
txt_introduction(r"INTRODUCTION","PROPOSED",text,articulo)
txt_introduction(r"INTRODUCTION","Proposed Network Architecture",text,articulo)
txt_introduction(r"INTRODUCTION","Fuzzy Logic System",text,articulo)
txt_introduction(r"Introduction","Materials",text,articulo)
txt_introduction(r"Introduction","Literature Review",text,articulo)
txt_introduction(r"Introduction","Related Work",text,articulo)
txt_introduction(r"Introduction","Methodology",text,articulo)
txt_introduction(r"Introduction","METHODOLOGY",text,articulo)
txt_introduction(r"Introduction","Project Description",text,articulo)
txt_introduction(r"Introduction","AgriPrediction model",text,articulo)
txt_introduction(r"Introduction","Proposed Work",text,articulo)
txt_introduction(r"Introduction","Proposed Irrigation System",text,articulo)
txt_introduction(r"Introduction","System description",text,articulo)
txt_introduction(r"Introduction","CONSTRUCTION",text,articulo)
txt_introduction(r"Introduction","OVERVIEW",text,articulo)
txt_introduction(r"Introduction","System design",text,articulo)
txt_introduction(r"Introduction","Proposed Solution",text,articulo)
txt_introduction(r"Introduction and Motivation","Proposed Irrigation System",text,articulo)
txt_introduction(r"Introduction","Design of the Agronomic System",text,articulo)
txt_introduction(r"Introduction","Design of IoT",text,articulo)
txt_introduction(r"Introduction","Types of sensors:",text,articulo)
txt_introduction(r"Introduction","Remote sensing systems",text,articulo)
txt_introduction(r"Introduction","Proposed System",text,articulo)
txt_introduction(r"Introduction","Irrigation decision-making",text,articulo)
##Extrayendo la metodología
txt_methodology(r"IRRIGATION SYSTEM",r"CONCLUSION",text,articulo)
txt_methodology(r"Irrigation decision-making",r"Conclusions",text,articulo)
txt_methodology(r"Materials and Methods",r"Conclusions",text,articulo)
txt_methodology(r"Fertigation System’s Description",r"Conclusions",text,articulo)
txt_methodology(r"Proposed Irrigation",r"Future Scope",text,articulo)
txt_methodology(r"Rule-Based Preparation Work",r"Conclusions",text,articulo)
txt_methodology(r"Project Description",r"Acknowledgment",text,articulo)
txt_methodology(r"System description",r"Conclusion",text,articulo)
txt_methodology(r"Design of AgriSens",r"Conclusion",text,articulo)
txt_methodology(r"AgriPrediction model",r"Conclusion",text,articulo)
txt_methodology(r"Proposed System",r"Conclusion",text,articulo)
txt_methodology(r"Proposed Solution",r"Conclusions",text,articulo)
txt_methodology(r"Proposed Work",r"Conclusion",text,articulo)
txt_methodology(r"Proposed Framework",r"Conclusion",text,articulo)
txt_methodology(r"Components and technologies of the architecture",r"Conclusion",text,articulo)
txt_methodology(r"Multimethod Approach",r"Discussion",text,articulo)
txt_methodology(r"Materials and methods",r"Results",text,articulo)
txt_methodology(r"Materials and methods",r"Discussion",text,articulo)
txt_methodology(r"Problem Statement",r"Results",text,articulo)
txt_methodology(r"PROPOSED",r"RESULTS",text,articulo)
txt_methodology(r"PROPOSED",r"CONCLUSION",text,articulo)
txt_methodology(r"ARCHITECTURE",r"Results",text,articulo)
txt_methodology(r"SYSTEM",r"RESULTS",text,articulo)
txt_methodology(r"System design",r"Conclusion",text,articulo)
txt_methodology(r"ARCHITECTURE",r"CONCLUSIONS",text,articulo)
txt_methodology(r"DESIGN",r"CONCLUSION",text,articulo)
txt_methodology(r"PROJECT DESCRIPTION",r"ACKNOWLEDGMENT",text,articulo)
txt_methodology(r"ENABLING IOT TECHNOLOGIES",r"CONCLUSION",text,articulo)
txt_methodology(r"PROTOTYPE IOT-BASED",r"CONCLUSION",text,articulo)
txt_methodology(r"METHODOLOGY",r"RESULTS",text,articulo)
txt_methodology(r"Methodology",r"Results",text,articulo)
txt_methodology(r"METHODOLOGY",r"Conclusions",text,articulo)
txt_methodology(r"Methodology",r"Conclusions",text,articulo)
txt_methodology(r"Working and modelling",r"Conclusions",text,articulo)
txt_methodology(r"MATERIALS AND METHODS",r"RESULTS",text,articulo)
txt_methodology(r'Materials and Methods',r'Results',text,articulo)
txt_methodology(r'Methodology',r'References',text,articulo)
txt_methodology(r'Matherial and methods',r'Conclusion',text,articulo)
txt_methodology(r'Working and modelling',r'Conclusion and future scope',text,articulo)
txt_methodology(r'Proposed Work',r'Experimental Results and Analysis',text,articulo)
txt_methodology(r'Design Aspects and challenges in Rural Areas',r'Conclusion',text,articulo)
txt_methodology(r'Matherials and methods',r'Conclusion',text,articulo)
txt_methodology(r'Multimethod Approach',r'Discussion',text,articulo)
txt_methodology(r'Design of the Agronomic System',r'Conclusions',text,articulo)
txt_methodology(r'Water Management',r'Discussion',text,articulo)
txt_methodology(r'Contribution',r'Conclusion',text,articulo)
txt_methodology(r'Proposed IoT Sensing Platform',r'Conclusions and Future Work',text,articulo)
txt_methodology(r'PROPOSED MODEL',r'Conclusion',text,articulo)
txt_methodology(r'Enabling IoT Technologies',r'Conclusion',text,articulo)
txt_methodology(r'OBJECTIVES',r'Conclusion',text,articulo)
txt_methodology(r'Proposed Platform',r'Conclusion and Future Work',text,articulo)
txt_methodology(r'METHODOLOGY',r'CONCLUSION',text,articulo)
txt_methodology(r'INTELLIGENT STRATEGIES FOR SUSTAINABLE',r'CONCLUSION',text,articulo)
txt_methodology(r'System Design',r'Conclusion',text,articulo)
txt_methodology(r'Contributions of the Current Paper',r'Conclusion and Future Works',text,articulo)
txt_methodology(r'Proposed system',r'Future work and conclusion',text,articulo)
txt_methodology(r'System Design',r'References',text,articulo)
##Extrayendo conclusiones
txt_conclusion(r"Conclusion",r"References",text,articulo)
txt_conclusion(r"Conclusion",r"Author",text,articulo)
txt_conclusion(r"Conclusion",r"Funding",text,articulo)
txt_conclusion(r"Conclusion",r"Acknowledgments",text,articulo)
txt_conclusion(r"Conclusion",r"REFERENCES",text,articulo)
txt_conclusion(r"CONCLUSION",r"REFERENCE",text,articulo)
txt_conclusion(r"RESULTS",r"References",text,articulo)
txt_conclusion(r"RESULTS",r"REFERENCES",text,articulo)
txt_conclusion(r"RESULTS",r"ACKNOWLEDGEMENTS",text,articulo)
txt_conclusion(r"Results and discussion",r"Conclusion",text,articulo)
txt_conclusion(r"Results and discussion",r"REFERENCES",text,articulo)
txt_conclusion(r"Future Scope",r"References",text,articulo)
txt_conclusion(r"CONCLUSION",r"ACKNOWLEDGMENT",text,articulo)
txt_conclusion(r"Results and discussions",r"Authors’ contributions",text,articulo)
txt_conclusion(r"Discussion",r"Authors’ contributions",text,articulo)
txt_conclusion(r"Discussion",r"Acknowledgement",text,articulo)
txt_conclusion(r"Discussion",r"References",text,articulo)
txt_conclusion(r"Discussion",r"Author Contributions",text,articulo)
##Extrayendo agradecimientos
txt_aknowledge(r"Acknowledgments",r"References",text,articulo)
txt_aknowledge(r"Acknowledgement",r"Appendix",text,articulo)
txt_aknowledge(r"ACKNOWLEDGEMENTS",r"REFERENCES",text,articulo)
txt_aknowledge(r"ACKNOWLEDGMENT",r"REFERENCES",text,articulo)
txt_aknowledge(r"Author Contributions",r"Conflicts of Interest",text,articulo)
txt_aknowledge(r"Authors’ contributions",r"References",text,articulo)
txt_aknowledge(r"Funding",r"References",text,articulo)
txt_aknowledge(r"Author",r"References",text,articulo)
agregar_articulo(articulo)
def txt_abstract(texto1,texto2,text,articulo):
if re.search(texto1, text):
if re.search(texto2, text):
articulo['Abstract']=re.findall(texto1+'(.*?)'+texto2, text)
def txt_keywords(texto1,texto2,text,articulo):
if re.search(texto1, text):
if re.search(texto2, text):
articulo['Keywords']=re.findall(texto1+'(.*?)'+texto2, text)
def txt_introduction(texto1,texto2,text,articulo):
if re.search(texto1, text):
if re.search(texto2, text):
articulo['Introduction']=re.findall(texto1+'(.*?)'+texto2, text)
def txt_methodology(texto1,texto2,text,articulo):
if re.search(texto1, text):
if re.search(texto2, text):
articulo['methodology']=re.findall(texto1+'(.*?)'+texto2, text)
def txt_conclusion(texto1,texto2,text,articulo):
if re.search(texto1, text):
if re.search(texto2, text):
articulo['Conclusion']=re.findall(texto1+'(.*?)'+texto2, text)
def txt_aknowledge(texto1,texto2,text,articulo):
if re.search(texto1, text):
if re.search(texto2, text):
articulo['Aknowledge']=re.findall(texto1+'(.*?)'+texto2, text)
def agregar_articulo(text):
articulos.append(text)
#https://naps.com.mx/blog/leer-archivos-en-python-por-linea-palabra/
#Unir elementos de lista: https://www.geeksforgeeks.org/python-merge-list-elements/
for i in range(len(listdir(mypath))-1):
#Extrayendo páginas
pdf_path='inglés_txt/'+onlyfiles[i]
datos = []
with open(pdf_path,encoding="UTF-8") as fname:
lineas = fname.readlines()
for linea in lineas:
datos.append(linea.strip(' '))
paper=''.join(datos[0:len(datos)])
llenar_diccionario(paper,onlyfiles[i])
#Crea un objeto en donde se depositara archivos
#pdf_path = 'inglés/1 Internet of Things (IoT) Based Water Irrigation System.pdf'
#pdf_path='Inglés/'+onlyfiles[0]
df = pd.DataFrame (articulos)
#Exportar de dataframe a excel
df.to_excel(r'Artículos_inglés.xlsx', index = False)
def pre_process(text):
abc=()
text=text.lower()
text = re.sub('\[.*?¿\]\%', ' ',text) #remover caracteres especiales
text = re.sub('[%s]' % re.escape(string.punctuation), ' ',text) #remover puntuación
text = re.sub('\w*\d\w*', '',text) #remover números entre palabras
text = re.sub(r'\b(\w+)( \1\b)+', r'\1', text) #remover palabras duplicadas consecutivas
text = re.sub('[^a-zA-Z]',' ', text) #remover alfabeto
text = re.sub(r',', '', text) # Remover comas
text = re.sub(r'-', '', text) # Remover comas
text = re.sub(r'\-', ' ', text) # Remover guiones
text = re.sub(r'á', 'a', text) # remover tildes
text = re.sub(r'é', 'e', text) # remover tildes
text = re.sub(r'í', 'i', text) # remover tildes
text = re.sub(r'ó', 'o', text) # remover tildes
text = re.sub(r'ú', 'u', text) # remover tildes
text = re.sub(r' ', ' ', text) # remover espacios
text = re.sub(r' ', ' ', text) # remover espacios
text = re.sub(r' ', ' ', text) # remover espacios
text = re.sub(r' ', ' ', text) # remover espacios
text = re.sub(r' ', ' ', text) # remover espacios
text = re.sub(r' ', ' ', text) # remover espacios
text = re.sub('\[*?¿\]\%', ' ',text) #remover caracteres especiales
#text = re.sub('\[.*?¿\]\%', ' ',text) retirando con punto
text = re.sub('\w*\d\w*', '',text) #remover números entre palabras
text = re.sub('[‘’“”…«»]', '',text)#remover caracteres adicionales
text = re.sub('\n', '',text) #remover newlines
return text
limpieza=lambda x:pre_process(x)
#pip install gensim
#Cargamos los datos
data=pd.read_excel("Artículos_inglés_3.xlsx")
data.head()#Mostrar los 5 primeros registros si no coloco un número
Title | Abstract | Keywords | Introduction | methodology | Conclusion | Aknowledge | |
---|---|---|---|---|---|---|---|
0 | A Comparative Analysis on Smart Farming Techni... | Agriculture is considered as one of the major... | s: Smart farming, Internet of Things (IoT), Io... | Internet of Things (IoT) is a term which was ... | ANALYSIS AND | Hardware Architecture There are many methodol... | We express our sincere gratitude to Ms. Kuppa... |
1 | A Decision Support System for Irrigation Manag... | : Automatic irrigation scheduling systems are ... | s: decision support systems; automatic irrigat... | Water is a limiting factor in agricultural pr... | ETo (reference evapotranspiration) is obtained... | In this section, we show the results obtained... | Contributions: Conceptualization, R.T.-S., A.... |
2 | A novel methodology for the development of an ... | The Internet of Things (IoT) plays a vital ro... | agriculture, Internet of Things, motor automa... | Economy of any country mainly deals with the ... | USING IOT In the proposed IoT irrigation syst... | AND DISCUSSIONS The proposed IoT model result... | NaN |
3 | A Smart Decision System for Digital Farming.txt | : New technologies have the potential to trans... | s: smart farming; IoT farming; agriculture sma... | Precision agriculture (PA) consists of managi... | The first step to develop our management platfo... | s and Future Work In this work, we have presen... | Contributions: C.C.B. designed the platform. ... |
4 | A Solar-Powered Fertigation System Based on Lo... | —Nowadays, the technological innovations affec... | s—precise farming, IoT devices, fertigation sy... | THE intensive agriculture allows to satisfy t... | Aim of this section is provide an overview ab... | In this work, we have reported on the design o... | (s). This is an open-access article distribute... |
#Conocer los datos cargados
data.info()#data es el nombre del dataframe
<class 'pandas.core.frame.DataFrame'> RangeIndex: 69 entries, 0 to 68 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Title 69 non-null object 1 Abstract 68 non-null object 2 Keywords 68 non-null object 3 Introduction 69 non-null object 4 methodology 69 non-null object 5 Conclusion 65 non-null object 6 Aknowledge 42 non-null object dtypes: object(7) memory usage: 3.9+ KB
type(data['Abstract'][22])
float
#Convertir de lista a string
#abstract=data.Abstract.to_string()
#data.fillna("nada")
data.fillna("",inplace=True)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 69 entries, 0 to 68 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Title 69 non-null object 1 Abstract 69 non-null object 2 Keywords 69 non-null object 3 Introduction 69 non-null object 4 methodology 69 non-null object 5 Conclusion 69 non-null object 6 Aknowledge 69 non-null object dtypes: object(7) memory usage: 3.9+ KB
data.Title=data.Title.str.replace("internet of things", "iot")
data.Abstract=data.Abstract.str.replace("internet of things", "iot")
data.Keywords=data.Keywords.str.replace("internet of things", "iot")
data.methodology=data.methodology.str.replace("internet of things", "iot")
data.Conclusion=data.Conclusion.str.replace("internet of things", "iot")
data.Title=data.Title.str.replace("Internet of Things", "iot")
data.Abstract=data.Abstract.str.replace("Internet of Things", "iot")
data.Keywords=data.Keywords.str.replace("Internet of Things", "iot")
data.methodology=data.methodology.str.replace("Internet of Things", "iot")
data.Conclusion=data.Conclusion.str.replace("Internet of Things", "iot")
# Función de limpieza
def cleaner(word):
try:
word = re.sub(r'\#\.', '', word)
word = re.sub(r',', '', word) # Remover comas
word = re.sub(r'\-', ' ', word) # Remover guiones
word = re.sub(r'á', 'a', word) # remover tildes
word = re.sub(r'é', 'e', word) # remover tildes
word = re.sub(r'í', 'i', word) # remover tildes
word = re.sub(r'ó', 'o', word) # remover tildes
word = re.sub(r'ú', 'u', word) # remover tildes
word = re.sub('\[.*?¿\]\%', ' ',word) #remover caracteres especiales
word = re.sub('[%s]' % re.escape(string.punctuation), ' ',word) #remover puntuación
word = re.sub('\w*\d\w*', '',word) #remover números entre palabras
word = re.sub('[‘’“”…«»]', '',word)#remover caracteres adicionales
word = re.sub('\n', ' ',word) #remover newlines
list_word_clean = []
for w1 in word.split(" "):
if w1.lower() not in stopwords:
list_word_clean.append(w1.lower())
bigram_list = bigram[list_word_clean]
out_text = lemmatization(" ".join(bigram_list))
return out_text
except TypeError:
return np.nan
def lemmatization(texts, allowed_postags=['NOUN','ADJ','VERB','ADV']):
texts_out = [ token.text for token in nlp(texts) if token.pos_ in
allowed_postags and token.text not in black_list and len(token.text)>2]
return texts_out
len(data.Title.to_list())
69
#pip install spacy
Requirement already satisfied: spacy in c:\users\suzak\anaconda3\lib\site-packages (3.1.3) Requirement already satisfied: jinja2 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (2.11.2) Requirement already satisfied: packaging>=20.0 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (20.4) Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (0.8.2) Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.8 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (3.0.8) Requirement already satisfied: srsly<3.0.0,>=2.4.1 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (2.4.2) Requirement already satisfied: pathy>=0.3.5 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (0.6.0) Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (2.24.0) Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (1.8.2) Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (2.0.6) Requirement already satisfied: setuptools in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (50.3.1.post20201107) Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (2.0.6) Requirement already satisfied: typer<0.5.0,>=0.3.0 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (0.4.0) Requirement already satisfied: thinc<8.1.0,>=8.0.9 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (8.0.11) Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (4.50.2) Requirement already satisfied: blis<0.8.0,>=0.4.0 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (0.7.5) Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (3.0.6) Requirement already satisfied: numpy>=1.15.0 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (1.21.3) Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in c:\users\suzak\anaconda3\lib\site-packages (from spacy) (1.0.6) Requirement already satisfied: MarkupSafe>=0.23 in c:\users\suzak\anaconda3\lib\site-packages (from jinja2->spacy) (1.1.1) Requirement already satisfied: pyparsing>=2.0.2 in c:\users\suzak\anaconda3\lib\site-packages (from packaging>=20.0->spacy) (2.4.7) Requirement already satisfied: six in c:\users\suzak\anaconda3\lib\site-packages (from packaging>=20.0->spacy) (1.12.0) Requirement already satisfied: smart-open<6.0.0,>=5.0.0 in c:\users\suzak\anaconda3\lib\site-packages (from pathy>=0.3.5->spacy) (5.2.1) Requirement already satisfied: certifi>=2017.4.17 in c:\users\suzak\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2020.6.20) Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\users\suzak\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.25.11) Requirement already satisfied: chardet<4,>=3.0.2 in c:\users\suzak\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)Note: you may need to restart the kernel to use updated packages. Requirement already satisfied: idna<3,>=2.5 in c:\users\suzak\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10) Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\users\suzak\anaconda3\lib\site-packages (from pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4->spacy) (3.7.4.3) Requirement already satisfied: click<9.0.0,>=7.1.1 in c:\users\suzak\anaconda3\lib\site-packages (from typer<0.5.0,>=0.3.0->spacy) (7.1.2)
import nltk
nltk.download("popular") #Descargar configuración popular
[nltk_data] Downloading collection 'popular' [nltk_data] | [nltk_data] | Downloading package cmudict to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package cmudict is already up-to-date! [nltk_data] | Downloading package gazetteers to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package gazetteers is already up-to-date! [nltk_data] | Downloading package genesis to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package genesis is already up-to-date! [nltk_data] | Downloading package gutenberg to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package gutenberg is already up-to-date! [nltk_data] | Downloading package inaugural to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package inaugural is already up-to-date! [nltk_data] | Downloading package movie_reviews to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package movie_reviews is already up-to-date! [nltk_data] | Downloading package names to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package names is already up-to-date! [nltk_data] | Downloading package shakespeare to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package shakespeare is already up-to-date! [nltk_data] | Downloading package stopwords to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package stopwords is already up-to-date! [nltk_data] | Downloading package treebank to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package treebank is already up-to-date! [nltk_data] | Downloading package twitter_samples to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package twitter_samples is already up-to-date! [nltk_data] | Downloading package omw to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package omw is already up-to-date! [nltk_data] | Downloading package wordnet to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package wordnet is already up-to-date! [nltk_data] | Downloading package wordnet31 to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package wordnet31 is already up-to-date! [nltk_data] | Downloading package wordnet_ic to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package wordnet_ic is already up-to-date! [nltk_data] | Downloading package words to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package words is already up-to-date! [nltk_data] | Downloading package maxent_ne_chunker to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package maxent_ne_chunker is already up-to-date! [nltk_data] | Downloading package punkt to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package punkt is already up-to-date! [nltk_data] | Downloading package snowball_data to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package snowball_data is already up-to-date! [nltk_data] | Downloading package averaged_perceptron_tagger to [nltk_data] | C:\Users\suzak\AppData\Roaming\nltk_data... [nltk_data] | Package averaged_perceptron_tagger is already up- [nltk_data] | to-date! [nltk_data] | [nltk_data] Done downloading collection popular
True
def tokenizar(texto):
tokens = word_tokenize(texto)
words = [w.lower() for w in tokens if w.isalnum()]
return words
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
import matplotlib.pyplot as plt
import gensim
import numpy as np
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
#from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
import matplotlib.pyplot as plt
import gensim
from gensim.corpora import Dictionary
from spacy.lang.en import English
nlp=English()
bigram=gensim.models.Phrases(data.methodology.to_list())
bigram[data.Title.to_list()[68].split()]
['Wireless', 'Sensor', 'Network', 'and', 'iot.txt']
wabstract=bigram[data.Abstract.to_list()[1].split()]
wmethod=bigram[data.methodology.to_list()[68].split()]
wconclusion=bigram[data.Conclusion.to_list()[68].split()]
wkey=bigram[data.Keywords.to_list()[68].split()]
stop_words = stopwords.words('english')
wabstract = [w for w in wabstract if not w in stop_words and w.isalpha()]
wmethod = [w for w in wmethod if not w in stop_words and w.isalpha()]
data.shape
(69, 7)
#Convertir a minúsculas, limpiar y tokenizar
from nltk.tokenize import word_tokenize
def tokenizar(texto):
tokens = word_tokenize(texto)
words = [w.lower() for w in tokens if w.isalnum()]
return words
#data['tokens'] = data['Llamada'].apply(lambda x: tokenizar(x))
data["tokens_abstract"] = data["Abstract"].apply(lambda x: tokenizar(x))
data.head()
Title | Abstract | Keywords | Introduction | methodology | Conclusion | Aknowledge | tokens_abstract | |
---|---|---|---|---|---|---|---|---|
0 | A Comparative Analysis on Smart Farming Techni... | Agriculture is considered as one of the major... | s: Smart farming, iot (IoT), IoT in agricultur... | Internet of Things (IoT) is a term which was ... | ANALYSIS AND | Hardware Architecture There are many methodol... | We express our sincere gratitude to Ms. Kuppa... | [agriculture, is, considered, as, one, of, the... |
1 | A Decision Support System for Irrigation Manag... | : Automatic irrigation scheduling systems are ... | s: decision support systems; automatic irrigat... | Water is a limiting factor in agricultural pr... | ETo (reference evapotranspiration) is obtained... | In this section, we show the results obtained... | Contributions: Conceptualization, R.T.-S., A.... | [automatic, irrigation, scheduling, systems, a... |
2 | A novel methodology for the development of an ... | The iot (IoT) plays a vital role in the entit... | agriculture, iot, motor automation, soil para... | Economy of any country mainly deals with the ... | USING IOT In the proposed IoT irrigation syst... | AND DISCUSSIONS The proposed IoT model result... | [the, iot, iot, plays, a, vital, role, in, the... | |
3 | A Smart Decision System for Digital Farming.txt | : New technologies have the potential to trans... | s: smart farming; IoT farming; agriculture sma... | Precision agriculture (PA) consists of managi... | The first step to develop our management platfo... | s and Future Work In this work, we have presen... | Contributions: C.C.B. designed the platform. ... | [new, technologies, have, the, potential, to, ... |
4 | A Solar-Powered Fertigation System Based on Lo... | —Nowadays, the technological innovations affec... | s—precise farming, IoT devices, fertigation sy... | THE intensive agriculture allows to satisfy t... | Aim of this section is provide an overview ab... | In this work, we have reported on the design o... | (s). This is an open-access article distribute... | [the, technological, innovations, affect, all,... |
def try_join(l):
try:
return ','.join(map(str, l))
except TypeError:
return np.nan
abstract=try_join(data.Abstract)
title=try_join(data.Title)
keywords=try_join(data.Keywords)
methodology=try_join(data.methodology)
conclusion=try_join(data.Conclusion)
abstract=pre_process(abstract)
title=pre_process(title)
keywords=pre_process(keywords)
conclusion=pre_process(conclusion)
abstract_txt=nlp(abstract)
tokens_abstract=[token.text for token in abstract_txt]
#Eliminamos stopwords
from nltk.corpus import stopwords
sw= stopwords.words('english')
def limpiar_stopwords(lista):
clean_tokens = lista[:]
for token in lista:
if token in sw:
clean_tokens.remove(token)
return clean_tokens
# Limpiamos los tokens
#data['sin_stopwords'] = data['tokens'].apply(lambda x: limpiar_stopwords(x))
#data.head()
abstract_clean=limpiar_stopwords(tokens_abstract)
len(abstract_clean)
8617
# Limpiamos los tokens
data['Abstract_sin_stopwords'] = data['tokens_abstract'].apply(lambda x: limpiar_stopwords(x))
data.head()
Title | Abstract | Keywords | Introduction | methodology | Conclusion | Aknowledge | tokens_abstract | Abstract_sin_stopwords | |
---|---|---|---|---|---|---|---|---|---|
0 | A Comparative Analysis on Smart Farming Techni... | Agriculture is considered as one of the major... | s: Smart farming, iot (IoT), IoT in agricultur... | Internet of Things (IoT) is a term which was ... | ANALYSIS AND | Hardware Architecture There are many methodol... | We express our sincere gratitude to Ms. Kuppa... | [agriculture, is, considered, as, one, of, the... | [agriculture, considered, one, major, sources,... |
1 | A Decision Support System for Irrigation Manag... | : Automatic irrigation scheduling systems are ... | s: decision support systems; automatic irrigat... | Water is a limiting factor in agricultural pr... | ETo (reference evapotranspiration) is obtained... | In this section, we show the results obtained... | Contributions: Conceptualization, R.T.-S., A.... | [automatic, irrigation, scheduling, systems, a... | [automatic, irrigation, scheduling, systems, h... |
2 | A novel methodology for the development of an ... | The iot (IoT) plays a vital role in the entit... | agriculture, iot, motor automation, soil para... | Economy of any country mainly deals with the ... | USING IOT In the proposed IoT irrigation syst... | AND DISCUSSIONS The proposed IoT model result... | [the, iot, iot, plays, a, vital, role, in, the... | [iot, iot, plays, vital, role, entity, sharing... | |
3 | A Smart Decision System for Digital Farming.txt | : New technologies have the potential to trans... | s: smart farming; IoT farming; agriculture sma... | Precision agriculture (PA) consists of managi... | The first step to develop our management platfo... | s and Future Work In this work, we have presen... | Contributions: C.C.B. designed the platform. ... | [new, technologies, have, the, potential, to, ... | [new, technologies, potential, transform, agri... |
4 | A Solar-Powered Fertigation System Based on Lo... | —Nowadays, the technological innovations affec... | s—precise farming, IoT devices, fertigation sy... | THE intensive agriculture allows to satisfy t... | Aim of this section is provide an overview ab... | In this work, we have reported on the design o... | (s). This is an open-access article distribute... | [the, technological, innovations, affect, all,... | [technological, innovations, affect, human, ac... |
# Segunda limpieza, suando stop-words de sklearn
from sklearn.feature_extraction import text
stop = text.ENGLISH_STOP_WORDS
def limpiar_stopwords2(lista):
clean_tokens = lista[:]
for token in lista:
if token in stop:
clean_tokens.remove(token)
return clean_tokens
abstract_clean=limpiar_stopwords2(abstract_clean)
len(abstract_clean)
8094
data['Abstract_sin_stopwords2'] = data['Abstract_sin_stopwords'].apply(lambda x: limpiar_stopwords(x))
data.sample(5)
Title | Abstract | Keywords | Introduction | methodology | Conclusion | Aknowledge | tokens_abstract | Abstract_sin_stopwords | Abstract_sin_stopwords2 | |
---|---|---|---|---|---|---|---|---|---|---|
30 | Inference System for Irrigation Scheduling wit... | . The design and implementation of an inferenc... | s: Artificial intelligence • Fuzzy logic • Irr... | Precision agriculture (PA) is defined as a fa... | This work was developed at the USOCHICAMOCHA ... | Outcomes of the inference system are specifie... | s Inference System for Irrigation Scheduling w... | [the, design, and, implementation, of, an, inf... | [design, implementation, inference, method, sc... | [design, implementation, inference, method, sc... |
59 | sCrop. A Novel Device for Sustainable Automati... | —Agriculture Cyber-Physical System (A-CPS) is ... | —Smart agriculture, agriculture cyber-physical... | OOD is one of the quintessential assets in li... | Existing works in smart agriculture works are... | and Future Works In this paper, a solar enabl... | ized licensed use limited to: Universidad Indu... | [system, is, becoming, increasingly, important... | [system, becoming, increasingly, important, en... | [system, becoming, increasingly, important, en... |
18 | Assessment_of_Sustainable_Agriculture_Practice... | —About 65% of adults across the world, engage ... | —Live-in-Labs®, Agriculture, Sustainable Devel... | The World Bank (WB) strongly stressed on the ... | In order to address any challenge, it is impor... | Observations in this village are in line with... | ized licensed use limited to: UNIVERSIDAD AUTO... | [65, of, adults, across, the, world, engage, i... | [65, adults, across, world, engage, agricultur... | [65, adults, across, world, engage, agricultur... |
26 | Development of a closed-loop irrigation system... | : Irrigation management Sugarcane farming iot\... | s: 1.\t | Water is a highly valued commodity by multipl... | , Software, Writing - original draft. S. Attar... | In this section, we will present the simulati... | acquisition, Project administration. Declarat... | [irrigation, management, sugarcane, farming, i... | [irrigation, management, sugarcane, farming, i... | [irrigation, management, sugarcane, farming, i... |
21 | CareBro_Personal_Farm_AssistantAn_IoT_based_Sm... | —Post Covid-19 era redefines farming in terms ... | s—IoT,EdgeComputing, Thingspeak Cloud, Arduino... | The idea behind making CareBro bloomed after ... | A. System Architecture The infrastructure for... | and future scope After analysing and explorin... | ized licensed use limited to: UNIVERSIDAD AUTO... | [era, redefines, farming, in, terms, of, ensur... | [era, redefines, farming, terms, ensuring, max... | [era, redefines, farming, terms, ensuring, max... |
type(data)
pandas.core.frame.DataFrame
#pip install wordcloud
abstract_clean
[' ', 'agriculture', 'considered', 'major', 'sources', 'maintaining', 'nation', 'gdp', 'developing', 'countries', 'developed', 'countries', 'relying', 'cultivation', 'improve', 'economic', 'wealth', 'modern', 'technology', 'era', 'technology', 'play', 'tremendous', 'role', 'agriculture', 'sector', 'advanced', 'technology', 'capability', 'automate', 'various', 'cultivation', 'phases', 'like', 'watering', 'fertilizing', 'harvesting', 'order', 'make', 'cultivation', 'phases', 'smarter', 'deploy', 'smart', 'sensors', 'fields', 'sense', 'water', 'level', 'photo', 'sensors', 'ensure', 'sufficient', 'sunlight', 'available', 'plant', 'growth', 'sensors', 'sense', 'nitrogen', 'content', 'inform', 'farmer', 'initiate', 'steps', 'proper', 'fertilizing', 'works', 'area', 'progressing', 'labs', 'analyzed', 'various', 'standard', 'iot', 'techniques', 'used', 'agriculture', 'sector', 'based', 'hardware', 'software', 'deriving', 'existing', 'challenges', 'making', 'farming', 'smarter', 'efficient', 'automatic', 'irrigation', 'scheduling', 'systems', 'highly', 'demanded', 'agricultural', 'sector', 'ability', 'save', 'water', 'manage', 'deficit', 'irrigation', 'strategies', 'elaborating', 'functional', 'efficient', 'automatic', 'irrigation', 'complex', 'task', 'high', 'number', 'factors', 'technician', 'considers', 'managing', 'irrigation', 'optimal', 'way', 'automatic', 'learning', 'systems', 'propose', 'alternative', 'traditional', 'irrigation', 'management', 'means', 'automatic', 'elaboration', 'predictions', 'based', 'learning', 'agronomist', 'dss', 'aim', 'paper', 'study', 'learning', 'techniques', 'order', 'determine', 'goodness', 'error', 'relative', 'expert', 'decision', 'orchards', 'tested', 'using', 'linear', 'regression', 'lr', 'random', 'forest', 'regression', 'rfr', 'support', 'vector', 'regression', 'svr', 'methods', 'engines', 'irrigation', 'decision', 'support', 'idss', 'proposed', 'results', 'obtained', 'learning', 'methods', 'orchards', 'compared', 'decisions', 'agronomist', 'entire', 'year', 'prediction', 'model', 'errors', 'determined', 'best', 'fitting', 'regression', 'model', 'results', 'obtained', 'lead', 'conclusion', 'methods', 'valid', 'engines', 'develop', 'automatic', 'irrigation', 'scheduling', 'systems', 'iot', 'iot', 'plays', 'vital', 'role', 'entity', 'sharing', 'minimizing', 'workload', 'human', 'beings', 'various', 'aspects', 'nowadays', 'term', 'iot', 'used', 'various', 'fields', 'health', 'care', 'auto', 'mobiles', 'industry', 'agriculture', 'agriculture', 'main', 'source', 'food', 'world', 'var', 'ious', 'problems', 'faced', 'farmers', 'agriculture', 'shortage', 'wastage', 'water', 'fertilizers', 'regard', 'optimal', 'iot', 'model', 'developed', 'proposed', 'attain', 'effective', 'crop', 'field', 'proposed', 'iot', 'model', 'monitor', 'record', 'temperature', 'soil', 'moisture', 'values', 'continuously', 'analyzed', 'achieve', 'optimal', 'plant', 'growth', 'yield', 'motor', 'connected', 'iot', 'model', 'automati', 'cally', 'switch', 'based', 'optimal', 'threshold', 'tempera', 'ture', 'soil', 'moisture', 'content', 'value', 'novel', 'irrigation', 'algorithm', 'named', 'differential', 'waterflow', 'algorithm', 'proposed', 'deployed', 'proposed', 'iot', 'model', 'automatic', 'usage', 'motor', 'field', 'proposed', 'iot', 'model', 'provides', 'web', 'interface', 'user', 'cloud', 'storage', 'farmer', 'control', 'monitor', 'remote', 'proposed', 'sys', 'tem', 'reduce', 'water', 'consumption', 'ensure', 'computational', 'intelligence', 'wileyonlinelibrary', 'com', 'journal', 'coin', 'wiley', 'periodicals', 'wiley', 'manikandanet', 'al', 'uniform', 'water', 'distribution', 'crops', 'poisson', 'distribution', 'results', 'increasing', 'yield', 'new', 'technologies', 'potential', 'transform', 'agriculture', 'reduce', 'environmental', 'impact', 'green', 'revolution', 'iot', 'iot', 'based', 'application', 'development', 'platforms', 'potential', 'run', 'farm', 'management', 'tools', 'capable', 'monitoring', 'real', 'time', 'events', 'integrated', 'interactive', 'innovation', 'models', 'fertirrigation', 'capabilities', 'extend', 'flexible', 'reconfiguration', 'programmed', 'actions', 'iot', 'platforms', 'require', 'complex', 'smart', 'decision', 'making', 'systems', 'based', 'data', 'analysis', 'data', 'mining', 'big', 'data', 'sets', 'paper', 'advantages', 'demonstrated', 'powerful', 'tool', 'applies', 'real', 'time', 'decisions', 'data', 'variable', 'rate', 'irrigation', 'selected', 'parameters', 'field', 'weather', 'conditions', 'field', 'parameters', 'index', 'vegetation', 'estimated', 'using', 'aerial', 'images', 'irrigation', 'events', 'flow', 'level', 'pressure', 'level', 'wind', 'speed', 'periodically', 'sampled', 'data', 'processed', 'decision', 'making', 'based', 'learning', 'prediction', 'rules', 'conjunction', 'drools', 'rule', 'engine', 'multimedia', 'platform', 'remotely', 'controlled', 'offers', 'smart', 'farming', 'open', 'data', 'network', 'shared', 'restriction', 'levels', 'information', 'exchange', 'oriented', 'farmers', 'fertilizer', 'provider', 'agricultural', 'technicians', 'provide', 'farmer', 'added', 'value', 'form', 'better', 'decision', 'making', 'efficient', 'exploitation', 'operations', 'management', 'nowadays', 'technological', 'innovations', 'affect', 'human', 'activities', 'agriculture', 'field', 'heavily', 'benefits', 'technologies', 'informatics', 'electronic', 'telecommunication', 'allowing', 'huge', 'improvements', 'productivity', 'resources', 'exploitation', 'manuscript', 'presents', 'innovative', 'low', 'cost', 'fertigation', 'assisting', 'cultures', 'using', 'data', 'processing', 'electronic', 'boards', 'wireless', 'sensors', 'network', 'wsn', 'connected', 'remote', 'software', 'platform', 'proposed', 'receives', 'information', 'related', 'air', 'soil', 'parameters', 'custom', 'solar', 'powered', 'wsn', 'control', 'unit', 'elaborates', 'acquired', 'data', 'using', 'dynamic', 'agronomic', 'models', 'implemented', 'cloud', 'platform', 'optimizing', 'typology', 'fertilizers', 'irrigations', 'frequency', 'function', 'weather', 'forecasts', 'got', 'line', 'weather', 'service', 'water', 'scarcity', 'desertification', 'considered', 'greatest', 'challenges', 'humanity', 'coming', 'decades', 'worldwide', 'agriculture', 'accounts', 'total', 'water', 'usage', 'industry', 'accounts', 'urban', 'use', 'greece', 'rural', 'development', 'model', 'poor', 'farming', 'practices', 'resulted', 'overwhelming', 'total', 'water', 'sumption', 'directed', 'farming', 'uses', 'furthermore', 'excessive', 'use', 'water', 'agriculture', 'combined', 'existing', 'pesticides', 'fertilizers', 'usage', 'levels', 'creates', 'exponential', 'problems', 'water', 'cycle', 'greece', 'taking', 'account', 'challenges', 'project', 'aims', 'exploit', 'state', 'art', 'technologies', 'particular', 'emerging', 'developments', 'field', 'iot', 'iot', 'means', 'promote', 'rational', 'use', 'water', 'resources', 'agriculture', 'particular', 'project', 'aims', 'accelerating', 'penetration', 'low', 'power', 'wide', 'access', 'lpwa', 'technologies', 'series', 'research', 'innovation', 'actions', 'focusing', 'design', 'development', 'operation', 'commercial', 'exploitation', 'relevant', 'hardware', 'software', 'iot', 'applications', 'preliminary', 'results', 'project', 'depicted', 'future', 'remarks', 'outlined', 'center', 'pivot', 'systems', 'widely', 'used', 'overcome', 'irrigation', 'needs', 'agricultural', 'fields', 'paper', 'autonomous', 'approach', 'proposed', 'order', 'improve', 'low', 'efficiency', 'irrigation', 'developing', 'based', 'water', 'requirement', 'plantations', 'field', 'data', 'data', 'local', 'temperature', 'local', 'wind', 'soil', 'moisture', 'precipitation', 'forecast', 'soil', 'evapotranspiration', 'calculation', 'information', 'enables', 'calculate', 'real', 'evapotranspiration', 'necessary', 'restrict', 'lysimetric', 'measures', 'way', 'schedules', 'irrigation', 'lower', 'cost', 'periods', 'considering', 'produced', 'energy', 'local', 'resources', 'price', 'energy', 'purchased', 'utility', 'grid', 'considered', 'irrigation', 'carried', 'time', 'interval', 'plantations', 'reach', 'wilding', 'point', 'carried', 'periods', 'lowest', 'cost', 'optimize', 'overall', 'operational', 'costs', 'irrigation', 'published', 'elsevier', 'open', 'access', 'article', 'cc', 'nc', 'nd', 'license', 'http', 'creativecommons', 'org', 'licenses', 'nc', 'nd', 'peer', 'review', 'responsibility', 'scientific', 'committee', 'international', 'conference', 'energy', 'environment', 'research', 'iceer', 'agriculture', 'wireless', 'sensor', 'networks', 'lora', 'prediction', 'arima', 'anticipation', 'problems', 'tone', 'significant', 'challenges', 'future', 'guarantee', 'food', 'inhabitants', 'planet', 'alternatives', 'issue', 'consists', 'increasing', 'production', 'accomplish', 'necessary', 'novative', 'options', 'applied', 'enhance', 'soil', 'capacity', 'protection', 'environmental', 'resources', 'context', 'iot', 'iot', 'gaining', 'attention', 'lot', 'alternatives', 'aid', 'farmers', 'smart', 'sensors', 'visualization', 'systems', 'state', 'art', 'presents', 'options', 'iot', 'applications', 'rural', 'environment', 'assist', 'agricultural', 'producer', 'decision', 'making', 'act', 'anticipate', 'problems', 'crops', 'article', 'presents', 'model', 'named', 'agriprediction', 'combines', 'short', 'medium', 'wireless', 'network', 'range', 'prediction', 'engine', 'anticipate', 'potential', 'crop', 'dys', 'functions', 'proactively', 'notifying', 'farmer', 'remedial', 'actions', 'soon', 'possible', 'achieve', 'agriprediction', 'presents', 'framework', 'components', 'based', 'lora', 'iot', 'technology', 'arima', 'prediction', 'model', 'results', 'demonstrated', 'feasibility', 'using', 'lora', 'rural', 'places', 'providing', 'advantages', 'prediction', 'observe', 'troubles', 'related', 'soil', 'humidity', 'temperature', 'particular', 'using', 'agriprediction', 'arugula', 'cultivation', 'gains', 'obtained', 'concerning', 'leaf', 'development', 'terms', 'weight', 'comparison', 'standard', 'cultivation', 'procedure', 'article', 'present', 'design', 'internet', 'things', 'iot', 'based', 'dynamic', 'irrigation', 'scheduling', 'agrisens', 'efficient', 'water', 'management', 'irrigated', 'crop', 'fields', 'agrisens', 'provides', 'real', 'time', 'automatic', 'dynamic', 'remote', 'manual', 'irrigation', 'treatment', 'different', 'growth', 'phases', 'crop', 'life', 'cycle', 'using', 'iot', 'low', 'cost', 'water', 'level', 'sensor', 'designed', 'measure', 'level', 'water', 'present', 'field', 'pro', 'pose', 'algorithm', 'automatic', 'dynamic', 'cum', 'manual', 'irrigation', 'based', 'farmer', 'requirements', 'agrisens', 'farmer', 'friendly', 'user', 'interface', 'provides', 'field', 'information', 'farmers', 'multimodal', 'manner', 'visual', 'display', 'cell', 'phone', 'web', 'portal', 'achieves', 'significant', 'results', 'respect', 'differ', 'ent', 'performance', 'metrics', 'data', 'validation', 'packet', 'delivery', 'ratio', 'energy', 'consumption', 'failure', 'rate', 'various', 'climatic', 'conditions', 'dynamic', 'irrigation', 'treatments', 'experimental', 'results', 'agrisens', 'helps', 'improve', 'crop', 'produc', 'tivity', 'traditional', 'manual', 'irrigation', ...]
#Nube de palabras
from wordcloud import WordCloud
#lista_palabras = data["stemming"].tolist()
#tokens = [keyword.strip() for sublista in abstract_clean for keyword in sublista]
texto= ' '.join(abstract_clean)
wc = WordCloud(background_color="white", max_words=1000, margin=0)
wc.generate(texto)
wc.to_file("nube1.png")
plt.figure(figsize=(15,15))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
#data.Abstract.iloc[0]=pre_process(data.Abstract.iloc[0])
#data.Abstract.iloc[1]=pre_process(data.Abstract.iloc[1])
data=data.drop(data[data["Abstract"]==""].index)
data.shape
(68, 10)
type(data['Abstract_sin_stopwords2'])
pandas.core.series.Series
#pip install plotly
abstract = pd.DataFrame(abstract_clean, columns = ['Abstract Limpio'])
abstract
Abstract Limpio | |
---|---|
0 | |
1 | agriculture |
2 | considered |
3 | major |
4 | sources |
... | ... |
8089 | farm |
8090 | cultivation |
8091 | harvesting |
8092 | irrigation |
8093 | fertilization |
8094 rows × 1 columns
#pd.value_counts(data['Abstract_sin_stopwords2'].plot(kind='bar'))
#Representación en vector de características tf*idf
#tf: Frecuencia de las palabras
#idf: Calificación que le damos a cada palabra
from sklearn.feature_extraction.text import TfidfVectorizer
def dummy_fun(doc):
return doc
tfidf=TfidfVectorizer(
analyzer="word",
tokenizer=dummy_fun,
preprocessor=dummy_fun,
token_pattern=None
)
X=tfidf.fit_transform(data['Abstract_sin_stopwords2'])
data_tfidf=pd.DataFrame(X.todense(),columns=tfidf.get_feature_names())
data_tfidf
0 | 1 | 10 | 100 | 12 | 13 | 14 | 15 | 17 | 18 | ... | xbee | year | years | yet | yield | yielding | yields | youtube | zambia | zucchini | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0000 | 0.0 | 0.0 |
1 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.073689 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0000 | 0.0 | 0.0 |
2 | 0.0 | 0.093098 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.120175 | 0.0 | 0.000000 | 0.0000 | 0.0 | 0.0 |
3 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0000 | 0.0 | 0.0 |
4 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0000 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
63 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0000 | 0.0 | 0.0 |
64 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.032094 | 0.0 | 0.043583 | 0.0000 | 0.0 | 0.0 |
65 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.046847 | 0.0 | 0.063618 | 0.0797 | 0.0 | 0.0 |
66 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0000 | 0.0 | 0.0 |
67 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0000 | 0.0 | 0.0 |
68 rows × 2395 columns
#Gráfica de palabras mas frecuentes del texto
freq = nltk.FreqDist(abstract_clean)
plt.figure(figsize=(16, 8))
freq.plot(50, cumulative=False)
<AxesSubplot:xlabel='Samples', ylabel='Counts'>
#Método del codo para encontrar la mejor opción
from sklearn.cluster import KMeans
#ks: Definiendo cuántos clústeres quiero probar
ks=range(5,68)#crear valores del 5 al 68
inertias=[]
for k in ks:
#Crear modelo
model=KMeans(n_clusters=k)
model.fit(data_tfidf)
inertias.append(model.inertia_)
# Graficar cantidad de clusters vs inertias
plt.plot(ks, inertias, '-o')
plt.xlabel('Numero de clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
([<matplotlib.axis.XTick at 0x1f0fed1b5e0>, <matplotlib.axis.XTick at 0x1f0fed1b5b0>, <matplotlib.axis.XTick at 0x1f0fed19130>, <matplotlib.axis.XTick at 0x1f0fed45340>, <matplotlib.axis.XTick at 0x1f0fed45850>, <matplotlib.axis.XTick at 0x1f0fed45d60>, <matplotlib.axis.XTick at 0x1f0fed4d2b0>, <matplotlib.axis.XTick at 0x1f0fed4d7c0>, <matplotlib.axis.XTick at 0x1f0fed4dcd0>, <matplotlib.axis.XTick at 0x1f0fed53220>, <matplotlib.axis.XTick at 0x1f0fed4da00>, <matplotlib.axis.XTick at 0x1f0fed45a90>, <matplotlib.axis.XTick at 0x1f0fed53640>, <matplotlib.axis.XTick at 0x1f0fed53a90>, <matplotlib.axis.XTick at 0x1f0fed53fa0>, <matplotlib.axis.XTick at 0x1f0fed594f0>, <matplotlib.axis.XTick at 0x1f0fed59a00>, <matplotlib.axis.XTick at 0x1f0fed59f10>, <matplotlib.axis.XTick at 0x1f0fed5f460>, <matplotlib.axis.XTick at 0x1f0fed5f970>, <matplotlib.axis.XTick at 0x1f0fed59100>, <matplotlib.axis.XTick at 0x1f0fed45910>, <matplotlib.axis.XTick at 0x1f0fed5f160>, <matplotlib.axis.XTick at 0x1f0fed5fe80>, <matplotlib.axis.XTick at 0x1f0fed673d0>, <matplotlib.axis.XTick at 0x1f0fed678e0>, <matplotlib.axis.XTick at 0x1f0fed67df0>, <matplotlib.axis.XTick at 0x1f0fed6b340>, <matplotlib.axis.XTick at 0x1f0fed6b850>, <matplotlib.axis.XTick at 0x1f0fed675b0>, <matplotlib.axis.XTick at 0x1f0fed4d490>, <matplotlib.axis.XTick at 0x1f0fed6b8b0>, <matplotlib.axis.XTick at 0x1f0fed6bdf0>, <matplotlib.axis.XTick at 0x1f0fed75340>, <matplotlib.axis.XTick at 0x1f0fed75850>, <matplotlib.axis.XTick at 0x1f0fed75d60>, <matplotlib.axis.XTick at 0x1f0fed7b2b0>, <matplotlib.axis.XTick at 0x1f0fed7b7c0>, <matplotlib.axis.XTick at 0x1f0fed7bcd0>, <matplotlib.axis.XTick at 0x1f0fed753d0>, <matplotlib.axis.XTick at 0x1f0fed4d3d0>, <matplotlib.axis.XTick at 0x1f0fed7b3d0>, <matplotlib.axis.XTick at 0x1f0fed83220>, <matplotlib.axis.XTick at 0x1f0fed83730>, <matplotlib.axis.XTick at 0x1f0fed83c40>, <matplotlib.axis.XTick at 0x1f0fed88190>, <matplotlib.axis.XTick at 0x1f0fed886a0>, <matplotlib.axis.XTick at 0x1f0fed88bb0>, <matplotlib.axis.XTick at 0x1f0fed837c0>, <matplotlib.axis.XTick at 0x1f0fed67490>, <matplotlib.axis.XTick at 0x1f0fed888b0>, <matplotlib.axis.XTick at 0x1f0ff1501f0>, <matplotlib.axis.XTick at 0x1f0ff150700>, <matplotlib.axis.XTick at 0x1f0ff150c10>, <matplotlib.axis.XTick at 0x1f0ff155160>, <matplotlib.axis.XTick at 0x1f0ff155670>, <matplotlib.axis.XTick at 0x1f0ff155b80>, <matplotlib.axis.XTick at 0x1f0ff15c0d0>, <matplotlib.axis.XTick at 0x1f0ff1551f0>, <matplotlib.axis.XTick at 0x1f0ff150280>, <matplotlib.axis.XTick at 0x1f0fed83400>, <matplotlib.axis.XTick at 0x1f0ff15c5e0>, <matplotlib.axis.XTick at 0x1f0ff15caf0>], [Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, '')])
#pip install yellowbrick
# Elbow Method for K means# Import ElbowVisualizer
from yellowbrick.cluster import KElbowVisualizer
model = KMeans()
# k is range of number of clusters.
visualizer = KElbowVisualizer(model, k=(1,68), timings= True)
visualizer.fit(data_tfidf) # Fit data to visualizer
visualizer.show() # Finalize and render figure
<AxesSubplot:title={'center':'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
from sklearn.datasets import make_blobs
# Instantiate the clustering model and visualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(5,68), metric='calinski_harabasz', timings=False)
visualizer.fit(data_tfidf) # Fit data to visualizer
visualizer.show() # Finalize and render figure
<AxesSubplot:title={'center':'Calinski Harabasz Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='calinski harabasz score'>
from sklearn.cluster import KMeans
model = KMeans(n_clusters=25, max_iter=100)
model.fit(data_tfidf)
#Inercia
print(model.inertia_)
print(model.max_iter)
0.0 300
#Centroides de los clusters
#Creando dataframe con los centroides
#Centroides de los clusters
centroides=pd.DataFrame(model.cluster_centers_, columns=tfidf.get_feature_names())
centroides.round(3)# Redondeo a tres decimales
0 | 1 | 10 | 100 | 12 | 13 | 14 | 15 | 17 | 18 | ... | xbee | year | years | yet | yield | yielding | yields | youtube | zambia | zucchini | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.074 | 0.000 | 0.000 | 0.000 | 0.0 | 0.000 | 0.0 | 0.0 | 0.000 |
1 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000 | 0.000 | 0.000 | 0.000 | 0.0 | 0.000 | 0.0 | 0.0 | 0.000 |
2 | 0.0 | 0.0 | 0.04 | 0.04 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.030 | 0.000 | 0.000 | 0.000 | 0.0 | 0.064 | 0.0 | 0.0 | 0.000 |
3 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000 | 0.000 | 0.093 | 0.069 | 0.0 | 0.000 | 0.0 | 0.0 | 0.000 |
4 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000 | 0.082 | 0.000 | 0.000 | 0.0 | 0.000 | 0.0 | 0.0 | 0.123 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
62 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000 | 0.077 | 0.000 | 0.000 | 0.0 | 0.000 | 0.0 | 0.0 | 0.000 |
63 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000 | 0.000 | 0.000 | 0.000 | 0.0 | 0.000 | 0.0 | 0.0 | 0.000 |
64 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.07 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.058 | 0.000 | 0.000 | 0.000 | 0.0 | 0.000 | 0.0 | 0.0 | 0.000 |
65 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000 | 0.000 | 0.000 | 0.000 | 0.0 | 0.000 | 0.0 | 0.0 | 0.000 |
66 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000 | 0.000 | 0.000 | 0.000 | 0.0 | 0.000 | 0.0 | 0.0 | 0.000 |
67 rows × 2395 columns
#Cluster asignado a cada registro
data['cluster']=model.labels_
data.head()
Title | Abstract | Keywords | Introduction | methodology | Conclusion | Aknowledge | tokens_abstract | Abstract_sin_stopwords | Abstract_sin_stopwords2 | cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | A Comparative Analysis on Smart Farming Techni... | Agriculture is considered as one of the major... | s: Smart farming, iot (IoT), IoT in agricultur... | Internet of Things (IoT) is a term which was ... | ANALYSIS AND | Hardware Architecture There are many methodol... | We express our sincere gratitude to Ms. Kuppa... | [agriculture, is, considered, as, one, of, the... | [agriculture, considered, one, major, sources,... | [agriculture, considered, one, major, sources,... | 15 |
1 | A Decision Support System for Irrigation Manag... | : Automatic irrigation scheduling systems are ... | s: decision support systems; automatic irrigat... | Water is a limiting factor in agricultural pr... | ETo (reference evapotranspiration) is obtained... | In this section, we show the results obtained... | Contributions: Conceptualization, R.T.-S., A.... | [automatic, irrigation, scheduling, systems, a... | [automatic, irrigation, scheduling, systems, h... | [automatic, irrigation, scheduling, systems, h... | 0 |
2 | A novel methodology for the development of an ... | The iot (IoT) plays a vital role in the entit... | agriculture, iot, motor automation, soil para... | Economy of any country mainly deals with the ... | USING IOT In the proposed IoT irrigation syst... | AND DISCUSSIONS The proposed IoT model result... | [the, iot, iot, plays, a, vital, role, in, the... | [iot, iot, plays, vital, role, entity, sharing... | [iot, iot, plays, vital, role, entity, sharing... | 56 | |
3 | A Smart Decision System for Digital Farming.txt | : New technologies have the potential to trans... | s: smart farming; IoT farming; agriculture sma... | Precision agriculture (PA) consists of managi... | The first step to develop our management platfo... | s and Future Work In this work, we have presen... | Contributions: C.C.B. designed the platform. ... | [new, technologies, have, the, potential, to, ... | [new, technologies, potential, transform, agri... | [new, technologies, potential, transform, agri... | 48 |
4 | A Solar-Powered Fertigation System Based on Lo... | —Nowadays, the technological innovations affec... | s—precise farming, IoT devices, fertigation sy... | THE intensive agriculture allows to satisfy t... | Aim of this section is provide an overview ab... | In this work, we have reported on the design o... | (s). This is an open-access article distribute... | [the, technological, innovations, affect, all,... | [technological, innovations, affect, human, ac... | [technological, innovations, affect, human, ac... | 13 |
#Se construye una nube por cada cluster
#https://matplotlib.org/tutorials/colors/colormaps.html tutoriales matplotlib
for cluster in data['cluster'].unique():
data_cluster = data[data['cluster'] == cluster ].reset_index()
lista_palabras = data["Abstract_sin_stopwords2"].tolist()
tokens = [keyword.strip() for sublista in lista_palabras for keyword in sublista]
texto= ' '.join(data_cluster)
wc = WordCloud( colormap="spring", min_font_size= 10,background_color='black', max_words=20,margin=0)
wc.generate(texto)
plt.figure()
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()
import nltk # importar natural language toolkit
nltk.download('punkt')
nltk.download('stopwords') # modulo para descargar stopwords en diferentes idiomas
import pandas as pd
import numpy as np
import string
import plotly
from nltk.stem import PorterStemmer
#LDA
from sklearn.decomposition import LatentDirichletAllocation as LDA
number_topics = 10
model = LDA(n_components=number_topics, n_jobs=-16)
model.fit(data_tfidf)
#Perplexity: captura que tan bien el modelo generado coincide con los datos proporcionados. Valores pequeños son un mejor modelo
# Calcula la probabilidad de que los documentos tengan asignados los temas asignados según la aparición de las palabras
print(model.perplexity(data_tfidf))
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
dtm = np.matrix(data_tfidf)
panel = pyLDAvis.sklearn.prepare(model, dtm, tfidf)
pyLDAvis.display(panel)
Se usarán tres modelos de análisis de tópicos.
1) Latent Dirichlet Allocation (LDA)
2) Latent semantic analysis (LSA)
3) Hierichal Dirichlet Process (HDP)
4) Latent Semantic Analysis (LSA)
Se busca evaluar los modelos teniendo en cuenta topic coherence.
data.columns
Index(['Title', 'Abstract', 'Keywords', 'Introduction', 'methodology', 'Conclusion', 'Aknowledge', 'tokens_abstract', 'Abstract_sin_stopwords', 'Abstract_sin_stopwords2', 'cluster'], dtype='object')
dictionary = Dictionary(data['Abstract_sin_stopwords2'].to_list())
dictionary.compactify() # Asignar nuevos ids para todas las palabras
# Filtrar extremos: tokens que aparecen en menos de (no_below) documents (absolute number) o mas de (no_above documents) fraccion del corpus total.
dictionary.filter_extremes(no_below=2, no_above=0.97, keep_n=None)
dictionary.compactify()
# Convertir el documento en (lista de palabras) en un formato = list of (token_id, token_count) 2-tuples
corpus = [dictionary.doc2bow(text) for text in data['Abstract_sin_stopwords2'].to_list()]
corpus
[[(0, 1), (1, 3), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 3), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 3), (39, 1), (40, 1), (41, 1), (42, 1), (43, 2), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 2), (50, 2), (51, 3), (52, 1), (53, 2), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 3), (62, 2), (63, 1), (64, 2), (65, 1), (66, 1)], [(6, 1), (17, 1), (42, 1), (49, 1), (60, 1), (65, 1), (67, 1), (68, 1), (69, 2), (70, 1), (71, 1), (72, 5), (73, 1), (74, 1), (75, 1), (76, 2), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 2), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 7), (91, 1), (92, 4), (93, 1), (94, 1), (95, 1), (96, 1), (97, 3), (98, 2), (99, 1), (100, 2), (101, 1), (102, 2), (103, 1), (104, 1), (105, 1), (106, 1), (107, 4), (108, 2), (109, 1), (110, 2), (111, 1), (112, 1), (113, 1), (114, 2), (115, 2), (116, 3), (117, 1), (118, 1), (119, 1), (120, 1), (121, 1), (122, 1), (123, 1)], [(1, 3), (2, 1), (6, 1), (10, 1), (13, 1), (18, 1), (22, 1), (24, 1), (26, 1), (30, 8), (44, 1), (48, 1), (63, 1), (64, 2), (65, 3), (72, 1), (83, 1), (90, 1), (98, 5), (101, 3), (106, 6), (108, 1), (115, 1), (124, 1), (125, 1), (126, 2), (127, 1), (128, 1), (129, 2), (130, 1), (131, 1), (132, 1), (133, 1), (134, 1), (135, 1), (136, 1), (137, 1), (138, 1), (139, 1), (140, 2), (141, 1), (142, 1), (143, 1), (144, 2), (145, 1), (146, 1), (147, 1), (148, 1), (149, 1), (150, 1), (151, 1), (152, 1), (153, 1), (154, 2), (155, 2), (156, 2), (157, 1), (158, 1), (159, 1), (160, 1), (161, 1), (162, 1), (163, 1), (164, 1), (165, 1), (166, 1), (167, 1), (168, 2), (169, 1), (170, 1), (171, 1), (172, 1), (173, 1), (174, 1), (175, 1), (176, 1), (177, 1), (178, 1), (179, 1), (180, 1), (181, 1), (182, 1), (183, 1), (184, 2)], [(1, 1), (6, 2), (17, 1), (22, 1), (23, 1), (30, 3), (31, 2), (35, 1), (52, 2), (68, 1), (75, 1), (76, 1), (77, 1), (90, 2), (92, 1), (95, 2), (103, 1), (104, 1), (115, 1), (116, 1), (121, 1), (142, 1), (144, 2), (164, 1), (177, 1), (185, 1), (186, 1), (187, 1), (188, 1), (189, 1), (190, 1), (191, 1), (192, 1), (193, 1), (194, 1), (195, 5), (196, 1), (197, 1), (198, 1), (199, 1), (200, 1), (201, 1), (202, 1), (203, 1), (204, 1), (205, 1), (206, 1), (207, 1), (208, 1), (209, 1), (210, 1), (211, 1), (212, 1), (213, 1), (214, 1), (215, 1), (216, 1), (217, 1), (218, 1), (219, 1), (220, 1), (221, 1), (222, 1), (223, 2), (224, 1), (225, 2), (226, 2), (227, 1), (228, 1), (229, 1), (230, 1), (231, 1), (232, 1), (233, 1), (234, 1), (235, 1), (236, 1), (237, 1), (238, 1), (239, 1), (240, 1), (241, 1), (242, 1)], [(1, 1), (51, 1), (54, 1), (106, 1), (115, 2), (121, 2), (131, 1), (133, 1), (136, 1), (143, 1), (144, 1), (147, 1), (165, 1), (168, 1), (195, 1), (201, 1), (210, 1), (216, 1), (219, 1), (223, 1), (224, 2), (238, 1), (241, 2), (243, 1), (244, 1), (245, 1), (246, 1), (247, 1), (248, 2), (249, 1), (250, 1), (251, 1), (252, 1), (253, 1), (254, 1), (255, 1), (256, 1), (257, 1), (258, 1), (259, 1), (260, 1), (261, 1), (262, 1), (263, 1), (264, 1), (265, 1), (266, 1), (267, 1), (268, 1), (269, 1), (270, 1), (271, 1), (272, 1), (273, 2)], [(1, 3), (8, 2), (9, 1), (21, 1), (23, 2), (30, 3), (65, 5), (96, 1), (98, 1), (108, 1), (143, 1), (144, 1), (149, 1), (161, 1), (175, 2), (185, 1), (197, 2), (201, 1), (211, 1), (214, 1), (238, 2), (267, 1), (274, 1), (275, 1), (276, 1), (277, 1), (278, 1), (279, 1), (280, 1), (281, 2), (282, 2), (283, 1), (284, 1), (285, 3), (286, 1), (287, 1), (288, 1), (289, 1), (290, 1), (291, 1), (292, 1), (293, 1), (294, 1), (295, 1), (296, 1), (297, 1), (298, 1), (299, 1), (300, 1), (301, 1), (302, 1), (303, 1), (304, 1), (305, 2), (306, 1), (307, 1), (308, 1), (309, 1), (310, 1), (311, 2), (312, 1), (313, 1), (314, 1), (315, 1), (316, 1), (317, 3), (318, 1), (319, 1), (320, 1), (321, 1), (322, 1), (323, 1), (324, 1), (325, 1), (326, 1), (327, 1), (328, 1), (329, 2), (330, 1), (331, 3), (332, 1), (333, 1)], [(6, 1), (9, 1), (14, 1), (24, 1), (29, 1), (42, 1), (63, 1), (65, 1), (68, 1), (90, 5), (103, 1), (106, 1), (115, 3), (116, 1), (122, 1), (144, 1), (154, 1), (168, 2), (172, 1), (195, 2), (210, 1), (218, 1), (221, 1), (242, 1), (248, 1), (253, 2), (262, 1), (267, 1), (279, 1), (322, 1), (334, 1), (335, 1), (336, 1), (337, 1), (338, 2), (339, 1), (340, 1), (341, 1), (342, 1), (343, 1), (344, 1), (345, 1), (346, 3), (347, 1), (348, 2), (349, 1), (350, 3), (351, 1), (352, 1), (353, 1), (354, 1), (355, 1), (356, 1), (357, 1), (358, 1), (359, 2), (360, 2), (361, 1), (362, 1), (363, 1), (364, 1), (365, 1), (366, 1), (367, 1), (368, 1), (369, 1), (370, 1), (371, 1), (372, 1), (373, 1), (374, 1)], [(1, 1), (6, 1), (8, 1), (12, 2), (22, 1), (30, 4), (35, 1), (41, 1), (51, 1), (52, 1), (56, 1), (61, 1), (68, 1), (76, 1), (98, 2), (100, 1), (104, 4), (108, 1), (115, 2), (116, 1), (121, 2), (127, 1), (137, 1), (138, 1), (142, 1), (145, 1), (148, 1), (157, 1), (161, 1), (168, 2), (172, 1), (185, 1), (187, 1), (196, 1), (197, 1), (198, 1), (199, 1), (219, 1), (226, 1), (263, 3), (266, 1), (267, 1), (272, 2), (284, 1), (303, 1), (311, 1), (324, 2), (335, 1), (347, 1), (353, 1), (375, 1), (376, 1), (377, 2), (378, 1), (379, 1), (380, 1), (381, 1), (382, 1), (383, 1), (384, 1), (385, 1), (386, 1), (387, 1), (388, 1), (389, 1), (390, 1), (391, 1), (392, 3), (393, 1), (394, 1), (395, 1), (396, 1), (397, 1), (398, 1), (399, 1), (400, 1), (401, 1), (402, 1), (403, 1), (404, 1), (405, 1), (406, 1), (407, 1), (408, 1), (409, 1)], [(6, 1), (17, 1), (21, 1), (22, 1), (24, 1), (26, 1), (29, 1), (30, 2), (31, 1), (43, 1), (64, 1), (65, 2), (72, 2), (90, 5), (95, 1), (108, 2), (110, 1), (115, 2), (120, 1), (121, 1), (129, 1), (134, 1), (137, 3), (142, 1), (144, 2), (151, 1), (162, 2), (165, 1), (176, 1), (181, 1), (194, 1), (195, 1), (210, 1), (219, 1), (231, 1), (254, 3), (271, 1), (290, 1), (294, 1), (308, 1), (335, 1), (346, 1), (366, 1), (371, 1), (403, 1), (405, 1), (410, 1), (411, 1), (412, 1), (413, 1), (414, 1), (415, 1), (416, 1), (417, 1), (418, 1), (419, 1), (420, 1), (421, 2), (422, 1), (423, 1), (424, 1), (425, 1), (426, 2), (427, 1), (428, 1), (429, 1), (430, 1), (431, 1), (432, 1), (433, 1), (434, 1)], [(1, 1), (6, 1), (23, 1), (30, 1), (31, 3), (36, 1), (51, 3), (61, 1), (63, 1), (64, 2), (65, 3), (70, 1), (87, 1), (90, 1), (93, 1), (103, 2), (105, 1), (106, 2), (115, 2), (117, 1), (120, 1), (121, 1), (137, 3), (142, 2), (144, 2), (152, 1), (154, 2), (156, 1), (157, 1), (163, 1), (168, 4), (172, 2), (174, 1), (177, 1), (184, 1), (188, 2), (199, 1), (208, 1), (210, 2), (217, 2), (219, 1), (220, 1), (238, 2), (243, 2), (248, 1), (272, 1), (284, 1), (324, 1), (325, 1), (366, 1), (371, 1), (385, 1), (390, 1), (399, 1), (403, 5), (406, 1), (407, 1), (413, 1), (429, 1), (434, 1), (435, 1), (436, 1), (437, 1), (438, 1), (439, 1), (440, 1), (441, 1), (442, 1), (443, 1), (444, 1), (445, 1), (446, 1), (447, 1), (448, 1), (449, 1), (450, 1), (451, 1), (452, 1), (453, 1), (454, 1), (455, 1), (456, 1), (457, 1), (458, 1)], [(6, 1), (13, 1), (17, 2), (22, 2), (27, 1), (30, 2), (49, 1), (51, 1), (54, 1), (55, 1), (65, 2), (68, 1), (72, 1), (90, 1), (95, 1), (96, 1), (99, 1), (103, 1), (111, 1), (116, 2), (121, 1), (127, 1), (133, 1), (134, 1), (135, 1), (138, 2), (142, 1), (148, 1), (155, 1), (168, 1), (195, 1), (202, 2), (210, 1), (217, 1), (219, 2), (220, 1), (238, 1), (262, 2), (263, 1), (272, 2), (279, 1), (288, 1), (315, 1), (353, 1), (354, 1), (366, 1), (371, 1), (389, 1), (395, 1), (403, 2), (407, 1), (440, 1), (459, 1), (460, 1), (461, 1), (462, 1), (463, 1), (464, 1), (465, 1), (466, 1), (467, 1), (468, 1), (469, 1), (470, 1), (471, 1), (472, 1), (473, 1), (474, 1), (475, 1), (476, 1), (477, 1), (478, 1), (479, 1), (480, 1), (481, 1), (482, 1)], [(1, 2), (19, 1), (30, 2), (41, 1), (42, 1), (53, 1), (65, 3), (68, 2), (90, 2), (95, 1), (97, 1), (103, 1), (106, 1), (108, 1), (112, 2), (115, 1), (116, 1), (136, 1), (137, 1), (141, 1), (145, 1), (158, 1), (175, 1), (199, 1), (220, 4), (290, 1), (319, 1), (322, 1), (324, 1), (342, 1), (361, 1), (370, 1), (376, 1), (387, 1), (399, 1), (412, 2), (420, 1), (436, 2), (441, 1), (445, 1), (476, 1), (483, 1), (484, 1), (485, 1), (486, 1), (487, 2), (488, 1), (489, 1), (490, 1), (491, 1), (492, 1), (493, 3), (494, 1), (495, 1), (496, 1), (497, 1), (498, 1), (499, 1), (500, 1), (501, 1), (502, 1)], [(1, 1), (6, 1), (22, 2), (23, 1), (26, 1), (51, 2), (52, 1), (59, 1), (68, 1), (70, 1), (72, 2), (75, 1), (76, 1), (81, 1), (85, 1), (90, 5), (92, 1), (94, 1), (97, 1), (98, 1), (99, 1), (106, 1), (108, 1), (114, 2), (115, 1), (116, 1), (118, 1), (119, 1), (121, 2), (129, 1), (132, 1), (136, 1), (137, 1), (168, 2), (172, 1), (187, 1), (188, 1), (199, 1), (216, 1), (217, 1), (220, 2), (226, 2), (234, 1), (238, 1), (241, 1), (253, 2), (258, 1), (260, 1), (262, 2), (272, 1), (284, 1), (290, 1), (316, 1), (331, 2), (332, 1), (334, 1), (340, 1), (388, 1), (395, 1), (403, 1), (415, 1), (423, 1), (471, 1), (481, 2), (482, 1), (496, 1), (503, 1), (504, 1), (505, 1), (506, 1), (507, 1), (508, 1), (509, 1), (510, 1), (511, 1), (512, 1), (513, 1), (514, 1), (515, 1), (516, 1), (517, 1), (518, 1), (519, 1), (520, 1), (521, 1), (522, 1), (523, 1), (524, 1), (525, 1), (526, 1), (527, 1), (528, 1), (529, 1), (530, 1), (531, 1), (532, 1), (533, 1), (534, 1)], [(1, 2), (6, 1), (23, 1), (30, 4), (42, 2), (45, 1), (48, 1), (52, 1), (56, 1), (65, 2), (90, 2), (95, 2), (103, 1), (131, 1), (137, 2), (139, 1), (142, 2), (148, 1), (162, 1), (169, 3), (195, 4), (210, 1), (221, 5), (230, 1), (238, 3), (240, 1), (243, 1), (253, 1), (262, 1), (265, 2), (268, 2), (281, 1), (294, 1), (318, 1), (324, 1), (331, 1), (344, 1), (346, 1), (382, 1), (408, 1), (413, 1), (427, 1), (451, 1), (465, 1), (475, 1), (478, 1), (503, 1), (513, 2), (535, 1), (536, 1), (537, 1), (538, 1), (539, 1), (540, 1), (541, 1), (542, 1), (543, 1), (544, 1), (545, 1), (546, 1), (547, 1), (548, 1), (549, 2), (550, 3), (551, 1), (552, 1), (553, 1), (554, 1), (555, 1), (556, 1), (557, 1)], [(6, 3), (29, 1), (30, 2), (42, 1), (60, 1), (65, 5), (95, 2), (103, 2), (115, 5), (166, 1), (182, 1), (188, 2), (197, 3), (209, 2), (224, 1), (256, 1), (317, 2), (318, 1), (324, 4), (344, 1), (387, 1), (424, 1), (450, 1), (519, 1), (548, 1), (558, 1), (559, 1), (560, 1), (561, 1), (562, 1), (563, 1), (564, 1), (565, 1), (566, 1), (567, 1), (568, 1), (569, 1), (570, 1), (571, 1), (572, 1), (573, 1), (574, 1), (575, 1)], [(11, 1), (14, 1), (21, 1), (36, 1), (52, 1), (61, 1), (65, 5), (76, 1), (90, 6), (94, 1), (95, 3), (101, 1), (103, 1), (108, 1), (114, 1), (115, 1), (125, 1), (142, 2), (152, 1), (188, 1), (197, 1), (202, 1), (208, 1), (210, 1), (218, 1), (248, 1), (263, 1), (267, 3), (268, 1), (272, 1), (303, 1), (311, 1), (317, 1), (332, 1), (347, 1), (378, 1), (401, 1), (407, 1), (413, 1), (442, 1), (468, 1), (488, 1), (576, 1), (577, 1), (578, 1), (579, 2), (580, 1), (581, 1), (582, 1), (583, 1), (584, 2), (585, 1), (586, 1), (587, 1), (588, 1), (589, 1), (590, 1), (591, 1), (592, 1)], [(1, 2), (11, 1), (23, 2), (30, 2), (42, 1), (91, 1), (103, 1), (141, 1), (145, 1), (200, 1), (208, 1), (253, 1), (351, 1), (396, 1), (399, 1), (405, 1), (446, 1), (451, 1), (463, 2), (478, 1), (528, 1), (545, 1), (548, 1), (589, 2), (592, 2), (593, 1), (594, 1), (595, 2), (596, 2), (597, 2), (598, 1), (599, 1), (600, 1), (601, 1), (602, 3), (603, 1), (604, 1), (605, 2), (606, 1), (607, 2), (608, 1), (609, 1), (610, 1), (611, 1), (612, 1), (613, 2), (614, 1)], [(6, 1), (13, 1), (33, 1), (44, 1), (51, 1), (55, 2), (67, 1), (69, 3), (73, 2), (79, 1), (81, 1), (84, 1), (90, 8), (92, 1), (95, 1), (98, 4), (102, 1), (104, 3), (107, 3), (121, 2), (124, 1), (137, 1), (154, 1), (168, 2), (184, 3), (195, 9), (216, 3), (217, 2), (226, 1), (231, 1), (241, 2), (248, 1), (271, 1), (282, 1), (311, 1), (322, 1), (331, 1), (354, 1), (376, 1), (389, 1), (398, 1), (409, 1), (413, 4), (469, 1), (484, 1), (496, 1), (497, 2), (499, 1), (503, 3), (509, 1), (511, 2), (518, 1), (524, 1), (527, 1), (531, 3), (551, 2), (591, 1), (592, 1), (615, 1), (616, 1), (617, 2), (618, 1), (619, 1), (620, 1), (621, 1), (622, 3), (623, 1), (624, 1), (625, 1), (626, 1), (627, 1), (628, 1), (629, 1), (630, 1), (631, 1), (632, 1), (633, 1), (634, 2), (635, 1), (636, 1), (637, 1), (638, 1), (639, 1), (640, 3), (641, 1), (642, 2), (643, 1), (644, 1), (645, 1), (646, 1)], [(1, 2), (8, 2), (23, 1), (65, 1), (68, 1), (90, 2), (106, 1), (113, 2), (115, 2), (120, 1), (137, 1), (142, 1), (183, 1), (184, 1), (195, 2), (212, 1), (244, 1), (296, 1), (315, 1), (324, 2), (400, 1), (436, 1), (444, 1), (448, 1), (460, 1), (515, 2), (604, 2), (647, 1), (648, 1), (649, 1), (650, 1), (651, 1), (652, 1), (653, 2), (654, 1), (655, 1), (656, 1), (657, 1), (658, 1), (659, 1), (660, 1)], [(6, 1), (23, 1), (24, 1), (30, 2), (31, 1), (51, 1), (52, 2), (63, 2), (65, 4), (87, 1), (90, 2), (103, 2), (104, 1), (106, 1), (115, 4), (116, 1), (120, 1), (121, 1), (131, 1), (136, 1), (144, 1), (147, 2), (151, 1), (155, 1), (162, 1), (176, 1), (181, 1), (183, 1), (195, 3), (196, 1), (199, 1), (217, 1), (228, 1), (247, 1), (249, 1), (272, 1), (324, 1), (403, 1), (411, 1), (453, 1), (465, 1), (471, 1), (478, 1), (507, 1), (532, 1), (534, 1), (541, 1), (601, 1), (619, 1), (625, 2), (626, 1), (661, 1), (662, 1), (663, 1), (664, 1), (665, 1), (666, 2), (667, 2), (668, 2), (669, 1), (670, 1), (671, 1), (672, 1), (673, 1), (674, 1), (675, 1), (676, 1), (677, 1), (678, 2), (679, 1), (680, 1), (681, 1), (682, 1), (683, 1), (684, 1), (685, 1)], [(1, 4), (14, 2), (17, 2), (21, 1), (30, 1), (33, 1), (51, 1), (66, 1), (68, 4), (81, 1), (95, 1), (103, 1), (106, 1), (108, 1), (111, 1), (113, 1), (115, 4), (116, 2), (127, 1), (131, 1), (133, 1), (134, 1), (139, 1), (187, 1), (190, 1), (192, 1), (195, 2), (197, 1), (217, 2), (219, 1), (220, 2), (238, 4), (253, 1), (272, 1), (284, 1), (294, 1), (317, 1), (331, 1), (346, 1), (347, 1), (357, 1), (360, 1), (374, 1), (376, 1), (394, 1), (395, 1), (403, 2), (494, 1), (496, 3), (499, 1), (513, 1), (516, 1), (543, 1), (546, 1), (587, 1), (590, 2), (592, 1), (603, 1), (617, 1), (640, 1), (644, 1), (663, 2), (686, 1), (687, 1), (688, 1), (689, 1), (690, 1), (691, 1), (692, 1), (693, 2), (694, 1), (695, 1), (696, 1)], [(19, 1), (22, 1), (23, 2), (30, 1), (35, 1), (52, 1), (61, 1), (62, 1), (76, 1), (84, 1), (90, 1), (95, 2), (106, 1), (131, 1), (136, 1), (137, 1), (142, 1), (160, 1), (179, 1), (184, 1), (192, 1), (202, 3), (217, 1), (232, 1), (265, 1), (324, 1), (330, 1), (334, 1), (347, 1), (366, 1), (371, 1), (407, 1), (427, 1), (478, 2), (493, 1), (534, 1), (541, 1), (583, 1), (587, 1), (612, 1), (661, 1), (697, 1), (698, 1), (699, 1), (700, 1), (701, 1), (702, 3), (703, 1), (704, 1), (705, 1), (706, 1), (707, 1), (708, 1), (709, 1), (710, 1), (711, 1), (712, 1), (713, 1)], [(1, 3), (13, 2), (19, 1), (22, 1), (26, 1), (30, 5), (31, 1), (32, 1), (35, 1), (49, 1), (52, 2), (63, 1), (68, 3), (74, 1), (81, 1), (90, 1), (103, 2), (106, 1), (115, 3), (119, 1), (121, 4), (137, 5), (146, 2), (147, 2), (152, 1), (154, 1), (162, 1), (172, 1), (184, 1), (188, 1), (194, 2), (200, 1), (208, 1), (210, 2), (214, 1), (217, 2), (219, 1), (229, 1), (238, 1), (248, 1), (272, 1), (332, 1), (344, 1), (346, 1), (386, 1), (388, 1), (403, 1), (419, 1), (431, 1), (465, 1), (478, 1), (676, 1), (681, 2), (691, 1), (693, 1), (709, 1), (714, 1), (715, 2), (716, 2), (717, 1), (718, 1), (719, 1), (720, 1), (721, 1), (722, 1), (723, 1), (724, 1), (725, 1), (726, 1), (727, 1), (728, 1), (729, 2), (730, 1), (731, 1), (732, 1)], [(1, 3), (11, 1), (14, 1), (25, 1), (52, 1), (65, 2), (71, 1), (72, 1), (90, 2), (106, 2), (115, 2), (116, 1), (165, 1), (175, 1), (183, 1), (190, 1), (220, 1), (230, 1), (294, 1), (314, 1), (324, 1), (329, 1), (346, 1), (367, 1), (371, 1), (384, 1), (399, 2), (455, 1), (475, 1), (493, 1), (495, 2), (535, 1), (605, 1), (607, 1), (619, 1), (623, 1), (648, 1), (699, 2), (717, 1), (733, 1), (734, 1), (735, 1), (736, 1), (737, 1), (738, 1), (739, 1), (740, 1), (741, 1), (742, 2), (743, 1), (744, 1), (745, 1), (746, 1), (747, 1), (748, 1), (749, 1)], [(0, 1), (1, 1), (14, 1), (17, 1), (29, 1), (30, 2), (34, 1), (37, 1), (52, 2), (61, 2), (63, 1), (64, 1), (68, 1), (95, 1), (115, 1), (136, 1), (142, 3), (161, 1), (187, 1), (195, 1), (197, 2), (204, 1), (217, 1), (220, 1), (238, 1), (243, 1), (303, 1), (308, 1), (324, 3), (344, 1), (399, 3), (419, 2), (425, 1), (441, 1), (443, 1), (558, 1), (561, 1), (580, 1), (670, 1), (715, 2), (716, 1), (741, 1), (750, 1), (751, 1), (752, 1), (753, 1), (754, 1), (755, 1), (756, 1), (757, 1), (758, 1), (759, 1), (760, 1), (761, 1), (762, 2), (763, 1), (764, 1), (765, 1), (766, 1), (767, 1), (768, 1)], [(7, 1), (13, 1), (16, 1), (22, 3), (23, 2), (29, 1), (36, 1), (65, 2), (68, 1), (75, 1), (76, 4), (83, 2), (89, 1), (90, 20), (94, 1), (95, 3), (103, 1), (108, 1), (109, 1), (110, 1), (113, 1), (114, 5), (115, 2), (116, 5), (121, 1), (137, 1), (142, 3), (168, 1), (195, 1), (196, 1), (199, 2), (202, 2), (230, 1), (232, 1), (239, 3), (240, 3), (248, 1), (249, 3), (251, 2), (255, 1), (258, 1), (261, 1), (315, 1), (331, 4), (333, 1), (344, 3), (346, 1), (369, 2), (371, 2), (376, 1), (389, 1), (405, 2), (441, 1), (460, 1), (463, 1), (478, 3), (493, 1), (497, 1), (529, 1), (551, 1), (559, 1), (597, 1), (609, 1), (619, 1), (666, 3), (694, 1), (704, 1), (721, 1), (769, 1), (770, 1), (771, 1), (772, 1), (773, 2), (774, 2), (775, 1), (776, 1), (777, 1), (778, 1), (779, 2), (780, 1), (781, 1), (782, 1), (783, 1), (784, 4), (785, 1), (786, 1), (787, 2), (788, 1), (789, 1), (790, 1), (791, 3), (792, 1), (793, 1)], [(1, 7), (22, 3), (30, 5), (48, 1), (87, 2), (90, 2), (95, 4), (98, 1), (106, 1), (111, 1), (113, 1), (190, 1), (195, 1), (222, 4), (238, 1), (248, 1), (271, 1), (283, 1), (301, 1), (322, 3), (334, 1), (338, 1), (342, 1), (384, 2), (396, 1), (413, 1), (442, 1), (476, 1), (480, 1), (481, 2), (496, 1), (514, 1), (521, 1), (523, 1), (529, 1), (536, 3), (540, 1), (592, 1), (635, 1), (639, 1), (650, 1), (675, 1), (682, 1), (690, 1), (695, 1), (730, 1), (734, 1), (759, 1), (763, 1), (769, 1), (794, 2), (795, 1), (796, 1), (797, 1), (798, 1), (799, 1), (800, 6), (801, 1), (802, 1), (803, 1)], [(1, 1), (3, 1), (16, 1), (41, 1), (52, 4), (63, 1), (65, 6), (71, 1), (72, 2), (80, 1), (90, 7), (100, 2), (108, 1), (115, 4), (116, 4), (121, 1), (136, 1), (137, 1), (138, 2), (152, 1), (154, 1), (158, 2), (168, 1), (194, 1), (197, 3), (199, 2), (200, 1), (205, 1), (208, 1), (210, 1), (245, 1), (249, 3), (269, 1), (294, 2), (317, 1), (331, 2), (338, 1), (381, 1), (413, 1), (430, 1), (481, 1), (493, 1), (503, 1), (516, 1), (524, 1), (526, 1), (528, 1), (531, 2), (548, 1), (555, 1), (616, 2), (634, 2), (734, 1), (771, 1), (781, 3), (785, 1), (786, 1), (801, 1), (803, 1), (804, 1), (805, 1), (806, 1), (807, 1), (808, 1), (809, 1), (810, 1), (811, 1), (812, 1)], [(0, 1), (6, 3), (13, 1), (15, 1), (30, 2), (48, 1), (51, 1), (52, 4), (64, 1), (65, 1), (75, 1), (90, 2), (95, 4), (103, 2), (106, 1), (114, 2), (115, 5), (116, 1), (131, 1), (133, 1), (140, 1), (155, 2), (160, 1), (175, 1), (188, 1), (194, 1), (197, 2), (217, 2), (238, 1), (248, 1), (251, 1), (282, 1), (324, 2), (346, 3), (372, 1), (406, 1), (412, 1), (465, 1), (473, 1), (514, 1), (516, 1), (540, 1), (555, 1), (573, 1), (598, 1), (601, 1), (605, 1), (647, 1), (660, 4), (667, 1), (751, 1), (759, 1), (770, 1), (813, 1), (814, 1), (815, 1), (816, 1), (817, 1), (818, 1), (819, 1)], [(1, 1), (6, 1), (10, 1), (13, 1), (31, 1), (51, 1), (63, 1), (65, 2), (79, 1), (89, 1), (90, 2), (100, 1), (103, 1), (115, 1), (121, 1), (137, 1), (144, 3), (154, 4), (168, 6), (171, 1), (172, 4), (177, 1), (178, 1), (195, 3), (197, 1), (214, 1), (241, 1), (246, 1), (249, 1), (258, 1), (294, 1), (302, 1), (331, 1), (332, 1), (347, 1), (348, 1), (353, 1), (359, 1), (371, 1), (380, 2), (388, 1), (423, 1), (445, 1), (476, 1), (513, 1), (548, 2), (661, 1), (677, 1), (680, 1), (706, 1), (734, 1), (752, 1), (792, 1), (820, 1), (821, 3), (822, 3), (823, 1), (824, 2), (825, 1), (826, 1), (827, 1), (828, 1), (829, 1), (830, 1), (831, 2)], [(20, 1), (23, 2), (60, 1), (68, 1), (86, 1), (105, 1), (114, 1), (115, 3), (116, 1), (137, 1), (142, 1), (148, 1), (168, 1), (190, 1), (191, 1), (212, 1), (217, 1), (250, 1), (265, 1), (281, 1), (404, 1), (413, 1), (456, 1), (462, 1), (463, 1), (481, 1), (539, 1), (568, 1), (586, 1), (718, 1), (723, 1), (755, 1), (797, 1), (832, 1), (833, 1), (834, 1), (835, 1), (836, 1)], [(3, 2), (30, 2), (42, 1), (49, 1), (51, 1), (61, 1), (63, 1), (68, 2), (72, 1), (81, 1), (90, 1), (103, 1), (106, 2), (115, 2), (118, 1), (121, 1), (133, 1), (138, 1), (145, 1), (148, 1), (165, 1), (172, 1), (183, 1), (188, 1), (193, 1), (217, 2), (219, 2), (230, 1), (238, 1), (248, 1), (263, 1), (268, 1), (324, 1), (330, 3), (331, 1), (350, 1), (390, 1), (399, 3), (424, 1), (441, 1), (465, 1), (493, 1), (495, 1), (522, 1), (524, 1), (574, 1), (615, 1), (712, 1), (798, 1), (824, 1), (837, 1), (838, 1), (839, 1), (840, 1), (841, 1), (842, 1), (843, 1)], [(6, 4), (22, 1), (24, 1), (29, 1), (30, 6), (32, 1), (38, 1), (41, 1), (48, 1), (51, 1), (52, 2), (58, 1), (65, 12), (83, 1), (90, 6), (97, 1), (105, 1), (115, 4), (121, 1), (140, 1), (142, 3), (160, 1), (161, 2), (171, 1), (176, 2), (179, 1), (191, 1), (197, 1), (214, 1), (219, 1), (265, 1), (272, 1), (325, 1), (332, 1), (343, 1), (371, 1), (383, 1), (391, 1), (403, 1), (421, 1), (439, 1), (443, 1), (463, 1), (478, 1), (493, 2), (495, 1), (567, 1), (570, 1), (597, 2), (606, 1), (697, 1), (699, 1), (712, 2), (718, 1), (741, 1), (799, 1), (818, 1), (826, 1), (844, 1), (845, 1), (846, 1), (847, 1), (848, 1), (849, 1), (850, 1), (851, 1), (852, 1), (853, 1), (854, 1), (855, 1), (856, 1)], [(1, 4), (3, 1), (6, 3), (21, 1), (23, 3), (30, 3), (52, 1), (74, 1), (105, 1), (106, 2), (217, 1), (219, 4), (220, 1), (279, 1), (324, 2), (331, 1), (396, 1), (402, 1), (428, 1), (478, 2), (514, 1), (516, 1), (541, 1), (555, 1), (571, 1), (579, 1), (586, 1), (625, 1), (648, 2), (671, 1), (686, 1), (745, 1), (803, 1), (813, 1), (819, 1), (844, 1), (847, 1), (857, 1), (858, 1), (859, 1), (860, 1), (861, 1)], [(6, 1), (8, 1), (30, 2), (65, 2), (66, 1), (68, 1), (76, 1), (90, 2), (103, 1), (106, 1), (113, 2), (116, 3), (121, 1), (134, 1), (183, 1), (188, 1), (226, 1), (240, 1), (241, 1), (311, 1), (366, 1), (476, 1), (516, 1), (568, 1), (620, 1), (669, 1), (734, 1), (809, 1), (811, 1), (844, 2), (845, 1), (862, 1), (863, 1), (864, 1), (865, 1), (866, 1), (867, 1), (868, 1)], [(1, 1), (2, 1), (6, 1), (8, 1), (21, 1), (23, 2), (30, 9), (52, 2), (64, 4), (68, 3), (87, 1), (93, 1), (103, 1), (116, 1), (121, 1), (220, 1), (223, 1), (226, 1), (230, 1), (238, 1), (248, 1), (260, 1), (266, 1), (272, 1), (284, 3), (303, 1), (322, 1), (354, 1), (373, 2), (376, 1), (403, 1), (428, 1), (465, 1), (481, 1), (513, 1), (514, 1), (516, 1), (527, 1), (543, 2), (555, 2), (561, 1), (563, 1), (574, 1), (758, 1), (801, 1), (818, 1), (819, 2), (820, 1), (849, 1), (862, 1), (869, 2), (870, 1), (871, 1), (872, 1), (873, 1), (874, 1), (875, 1), (876, 1), (877, 1), (878, 1)], [(6, 1), (13, 2), (16, 1), (17, 1), (21, 2), (22, 4), (23, 1), (24, 3), (29, 4), (30, 6), (32, 1), (34, 2), (36, 1), (50, 1), (51, 1), (52, 3), (53, 4), (65, 8), (73, 1), (76, 3), (81, 1), (90, 25), (91, 1), (94, 1), (95, 9), (96, 1), (98, 2), (106, 1), (110, 3), (114, 3), (115, 8), (116, 2), (121, 2), (123, 1), (135, 1), (136, 2), (137, 4), (139, 1), (142, 5), (144, 3), (149, 1), (150, 1), (155, 2), (164, 1), (168, 2), (175, 1), (183, 1), (190, 1), (194, 2), (195, 6), (197, 1), (202, 1), (205, 1), (210, 4), (220, 1), (225, 2), (226, 1), (230, 5), (232, 1), (233, 1), (234, 1), (238, 2), (239, 2), (240, 2), (241, 1), (247, 1), (248, 1), (249, 2), (251, 2), (253, 1), (254, 1), (261, 2), (282, 1), (284, 1), (286, 1), (315, 2), (317, 2), (331, 3), (332, 1), (336, 1), (346, 1), (347, 1), (350, 1), (355, 1), (366, 1), (371, 6), (376, 2), (389, 1), (396, 1), (405, 1), (418, 1), (419, 1), (455, 1), (457, 1), (464, 1), (465, 2), (475, 1), (478, 2), (487, 1), (493, 1), (497, 2), (502, 2), (517, 1), (521, 1), (532, 1), (533, 1), (534, 1), (536, 2), (538, 1), (550, 1), (551, 1), (552, 1), (554, 2), (563, 1), (566, 1), (579, 1), (610, 1), (621, 1), (632, 1), (644, 2), (647, 1), (651, 1), (661, 2), (667, 2), (668, 1), (682, 1), (683, 1), (684, 1), (692, 2), (694, 1), (708, 1), (718, 2), (726, 1), (732, 1), (740, 1), (747, 1), (749, 1), (751, 1), (752, 1), (754, 1), (765, 1), (768, 2), (770, 4), (773, 2), (774, 2), (779, 12), (782, 1), (784, 2), (785, 1), (787, 7), (791, 2), (793, 7), (807, 1), (816, 1), (819, 1), (828, 1), (837, 2), (838, 1), (860, 2), (879, 1), (880, 1), (881, 1), (882, 1), (883, 1), (884, 1), (885, 2), (886, 1), (887, 1), (888, 1), (889, 1), (890, 1), (891, 1), (892, 1), (893, 1), (894, 1), (895, 1), (896, 1), (897, 1), (898, 1), (899, 1), (900, 1), (901, 3), (902, 1), (903, 1), (904, 1)], [(1, 1), (10, 1), (19, 1), (24, 1), (30, 3), (35, 1), (51, 1), (65, 1), (83, 1), (115, 2), (121, 2), (133, 1), (147, 1), (155, 1), (168, 2), (195, 1), (197, 2), (217, 1), (230, 1), (238, 1), (239, 1), (259, 1), (284, 1), (294, 1), (322, 2), (331, 1), (347, 1), (412, 1), (451, 1), (472, 1), (479, 1), (597, 1), (637, 1), (718, 1), (759, 1), (797, 1), (803, 1), (817, 1), (826, 1), (830, 1), (831, 1), (854, 1), (857, 1), (872, 1), (905, 1), (906, 1), (907, 1), (908, 2), (909, 1)], [(1, 3), (13, 1), (22, 1), (23, 1), (30, 1), (31, 6), (52, 1), (57, 1), (61, 1), (65, 5), (68, 3), (83, 1), (87, 1), (89, 1), (90, 3), (97, 1), (98, 1), (106, 3), (109, 1), (111, 1), (115, 3), (119, 1), (120, 1), (121, 1), (131, 1), (137, 1), (138, 1), (142, 1), (147, 2), (148, 1), (154, 4), (155, 1), (164, 1), (168, 3), (169, 1), (172, 4), (176, 2), (195, 1), (199, 1), (208, 2), (217, 2), (218, 1), (232, 2), (246, 1), (253, 1), (258, 1), (267, 2), (268, 1), (271, 1), (324, 1), (330, 1), (337, 1), (343, 1), (344, 1), (346, 1), (347, 1), (353, 1), (371, 2), (389, 1), (390, 1), (399, 1), (405, 2), (412, 1), (476, 1), (498, 1), (630, 1), (647, 1), (648, 1), (656, 1), (666, 2), (667, 1), (678, 5), (697, 2), (699, 1), (719, 1), (728, 3), (736, 1), (741, 1), (757, 1), (790, 1), (801, 1), (839, 3), (849, 1), (851, 2), (867, 1), (868, 1), (901, 1), (907, 1), (910, 1), (911, 1), (912, 1), (913, 3), (914, 1), (915, 1), (916, 4), (917, 1), (918, 1), (919, 1), (920, 1)], [(0, 1), (1, 2), (6, 1), (26, 1), (30, 2), (34, 1), (44, 1), (52, 2), (58, 1), (62, 1), (65, 4), (76, 1), (88, 1), (90, 4), (92, 1), (95, 1), (97, 1), (98, 2), (101, 1), (104, 1), (106, 1), (108, 1), (111, 1), (115, 5), (120, 1), (121, 2), (137, 1), (141, 1), (144, 2), (162, 1), (169, 1), (183, 1), (195, 1), (199, 1), (223, 1), (241, 1), (267, 1), (269, 1), (325, 1), (358, 1), (367, 1), (391, 1), (417, 1), (423, 1), (449, 1), (450, 1), (473, 1), (475, 1), (478, 1), (481, 1), (493, 1), (503, 1), (547, 1), (567, 1), (568, 1), (622, 1), (633, 1), (636, 1), (662, 1), (687, 1), (693, 1), (724, 1), (738, 1), (835, 1), (846, 1), (864, 1), (890, 1), (921, 1), (922, 1), (923, 1), (924, 1), (925, 2), (926, 1)], [(6, 1), (23, 3), (27, 2), (29, 1), (30, 3), (42, 1), (54, 1), (61, 2), (95, 1), (103, 1), (127, 1), (137, 1), (142, 2), (148, 1), (154, 1), (162, 1), (168, 3), (172, 1), (175, 1), (184, 1), (188, 1), (190, 1), (210, 2), (218, 1), (224, 6), (226, 1), (246, 1), (266, 1), (324, 2), (331, 1), (334, 1), (338, 1), (347, 2), (382, 1), (389, 1), (400, 1), (407, 1), (412, 1), (428, 1), (441, 1), (444, 1), (450, 1), (461, 1), (465, 1), (477, 1), (479, 1), (485, 1), (521, 1), (543, 1), (555, 1), (556, 1), (572, 1), (621, 1), (648, 2), (686, 1), (694, 1), (696, 1), (700, 1), (710, 1), (711, 1), (722, 2), (757, 1), (803, 2), (817, 1), (844, 1), (881, 1), (927, 1), (928, 1), (929, 1), (930, 1), (931, 1), (932, 1), (933, 1), (934, 1)], [(11, 1), (13, 1), (17, 1), (23, 2), (26, 1), (30, 1), (35, 1), (47, 1), (55, 1), (65, 2), (68, 1), (77, 1), (83, 2), (86, 1), (89, 2), (90, 7), (95, 6), (106, 2), (114, 1), (115, 5), (119, 1), (126, 1), (127, 1), (129, 1), (131, 1), (134, 1), (137, 3), (142, 2), (145, 2), (161, 1), (168, 2), (173, 1), (184, 2), (195, 3), (197, 1), (207, 1), (208, 1), (210, 1), (219, 1), (220, 1), (221, 1), (223, 1), (236, 1), (241, 1), (248, 1), (264, 1), (279, 1), (286, 1), (302, 1), (308, 1), (331, 1), (335, 1), (338, 1), (339, 1), (341, 1), (346, 1), (365, 1), (368, 1), (389, 1), (418, 2), (439, 1), (463, 1), (465, 1), (468, 1), (481, 1), (495, 1), (534, 1), (555, 1), (568, 1), (574, 1), (579, 1), (584, 1), (588, 1), (595, 2), (597, 1), (611, 1), (621, 1), (654, 1), (658, 1), (663, 2), (677, 1), (687, 1), (702, 1), (707, 1), (735, 1), (745, 1), (758, 2), (784, 1), (796, 1), (799, 3), (817, 1), (818, 1), (819, 1), (832, 1), (833, 2), (834, 1), (844, 1), (862, 2), (878, 1), (889, 1), (935, 1), (936, 1), (937, 1), (938, 1), (939, 2), (940, 2), (941, 1), (942, 1), (943, 1), (944, 1)], [(1, 3), (8, 1), (11, 1), (18, 1), (30, 1), (35, 1), (51, 2), (52, 1), (65, 7), (73, 1), (79, 1), (83, 1), (90, 6), (91, 1), (95, 2), (115, 1), (116, 6), (133, 1), (134, 1), (142, 1), (145, 1), (168, 1), (175, 1), (194, 1), (197, 1), (217, 1), (223, 1), (230, 1), (238, 2), (241, 1), (248, 1), (249, 1), (272, 1), (273, 1), (288, 1), (315, 1), (325, 1), (331, 1), (352, 1), (376, 1), (389, 1), (396, 1), (399, 1), (426, 1), (447, 1), (455, 1), (468, 1), (471, 1), (479, 1), (494, 2), (500, 1), (501, 1), (524, 1), (537, 1), (542, 1), (548, 1), (565, 1), (575, 1), (585, 2), (592, 1), (601, 1), (621, 1), (704, 1), (718, 1), (730, 1), (765, 1), (790, 1), (805, 2), (815, 1), (845, 1), (878, 1), (882, 1), (945, 1), (946, 2), (947, 1), (948, 1), (949, 1), (950, 1), (951, 1)], [(17, 1), (19, 1), (22, 1), (30, 4), (34, 1), (61, 1), (65, 4), (90, 3), (92, 1), (106, 2), (113, 1), (115, 6), (116, 1), (117, 2), (120, 1), (126, 1), (128, 1), (134, 1), (137, 2), (139, 1), (142, 1), (144, 1), (147, 1), (150, 1), (164, 1), (175, 1), (180, 1), (186, 1), (189, 1), (195, 2), (199, 1), (210, 1), (221, 1), (238, 1), (249, 1), (257, 1), (271, 1), (279, 1), (297, 1), (365, 1), (395, 1), (438, 1), (467, 1), (484, 1), (485, 1), (505, 1), (511, 2), (513, 1), (524, 1), (566, 1), (576, 1), (580, 1), (618, 2), (622, 1), (633, 1), (663, 1), (672, 1), (674, 1), (676, 1), (686, 1), (688, 1), (758, 2), (761, 1), (770, 1), (781, 1), (785, 1), (788, 1), (841, 1), (870, 1), (880, 1), (885, 1), (925, 2), (932, 1), (944, 2), (952, 1), (953, 1), (954, 1), (955, 1), (956, 1), (957, 1), (958, 1), (959, 1), (960, 1), (961, 1), (962, 1), (963, 1), (964, 1), (965, 1), (966, 1), (967, 1)], [(1, 1), (10, 2), (12, 1), (22, 1), (30, 2), (31, 2), (42, 1), (52, 1), (61, 1), (63, 1), (65, 3), (68, 3), (76, 1), (90, 2), (106, 2), (115, 6), (121, 1), (138, 1), (142, 1), (144, 3), (152, 1), (154, 4), (156, 3), (168, 2), (169, 1), (188, 1), (193, 1), (195, 1), (209, 1), (210, 3), (217, 1), (238, 1), (262, 1), (265, 1), (317, 1), (324, 1), (328, 1), (403, 2), (405, 1), (425, 1), (441, 1), (474, 2), (524, 1), (534, 1), (556, 1), (648, 1), (666, 1), (678, 2), (720, 1), (748, 1), (756, 3), (762, 1), (770, 1), (778, 1), (781, 1), (803, 1), (840, 1), (851, 1), (863, 1), (872, 1), (894, 2), (908, 1), (910, 1), (912, 1), (918, 1), (965, 1), (968, 1), (969, 1), (970, 2), (971, 1)], [(1, 4), (5, 1), (6, 1), (11, 1), (17, 1), (28, 1), (30, 1), (32, 1), (34, 1), (36, 1), (46, 1), (68, 5), (77, 2), (83, 1), (87, 1), (89, 1), (90, 1), (96, 1), (103, 1), (106, 2), (109, 1), (115, 3), (120, 1), (121, 1), (122, 1), (123, 1), (138, 2), (142, 5), (154, 1), (172, 1), (180, 1), (194, 3), (197, 1), (199, 1), (223, 1), (230, 1), (237, 1), (238, 1), (241, 7), (248, 2), (267, 2), (271, 1), (312, 1), (315, 1), (324, 1), (331, 1), (342, 1), (350, 2), (362, 1), (371, 4), (375, 1), (390, 1), (396, 1), (399, 2), (411, 2), (414, 1), (417, 1), (419, 1), (430, 1), (437, 1), (443, 2), (450, 1), (451, 1), (455, 1), (475, 1), (493, 1), (496, 1), (514, 1), (574, 1), (581, 1), (587, 1), (603, 1), (642, 2), (648, 1), (653, 3), (655, 1), (657, 1), (675, 2), (699, 1), (703, 1), (723, 1), (730, 1), (749, 3), (753, 1), (761, 1), (766, 1), (780, 1), (781, 1), (783, 1), (800, 3), (802, 1), (808, 1), (810, 3), (829, 1), (834, 1), (848, 1), (856, 1), (858, 1), (883, 1), (894, 1), (920, 2), (935, 1), (936, 1), (938, 1), (947, 1), (963, 1), (972, 1), (973, 1), (974, 1), (975, 1), (976, 1), (977, 1), (978, 1)], [(1, 4), (3, 1), (22, 1), (25, 2), (30, 3), (31, 1), (32, 1), (33, 3), (36, 1), (61, 1), (65, 1), (68, 1), (70, 1), (90, 3), (98, 1), (115, 1), (119, 2), (120, 1), (121, 1), (122, 1), (137, 1), (138, 3), (142, 2), (147, 2), (161, 2), (164, 2), (169, 1), (177, 1), (181, 1), (195, 1), (219, 2), (248, 3), (249, 1), (258, 1), (269, 1), (291, 1), (315, 1), (324, 1), (329, 1), (343, 1), (371, 1), (393, 1), (403, 1), (406, 2), (413, 1), (417, 1), (426, 1), (428, 1), (434, 1), (435, 1), (437, 1), (451, 1), (477, 1), (489, 1), (490, 1), (493, 1), (494, 1), (504, 1), (544, 1), (567, 1), (570, 3), (574, 1), (578, 1), (592, 1), (603, 1), (604, 1), (607, 3), (653, 4), (655, 1), (662, 1), (673, 1), (675, 1), (679, 2), (687, 1), (691, 1), (697, 2), (699, 1), (700, 1), (705, 1), (729, 1), (733, 1), (737, 1), (738, 1), (739, 1), (744, 1), (746, 1), (749, 8), (750, 1), (760, 1), (812, 1), (818, 1), (843, 1), (869, 1), (870, 1), (879, 1), (881, 1), (887, 1), (904, 1), (914, 1), (916, 1), (921, 1), (926, 1), (929, 1), (941, 1), (946, 1), (958, 1), (963, 1), (979, 1), (980, 1), (981, 1), (982, 1)], [(6, 2), (30, 2), (34, 1), (44, 3), (52, 1), (63, 1), (76, 1), (77, 2), (90, 8), (92, 3), (98, 3), (105, 1), (106, 2), (107, 1), (108, 2), (113, 1), (115, 4), (116, 1), (118, 1), (131, 1), (194, 1), (195, 1), (199, 2), (216, 1), (268, 1), (357, 1), (397, 1), (403, 1), (413, 2), (423, 1), (424, 1), (478, 2), (488, 1), (518, 1), (522, 2), (531, 1), (539, 1), (591, 1), (618, 1), (630, 1), (633, 1), (641, 1), (643, 1), (644, 1), (666, 1), (686, 1), (693, 1), (698, 1), (701, 1), (727, 1), (762, 1), (821, 1), (919, 1), (960, 1), (964, 1), (972, 1), (983, 1), (984, 1)], [(1, 1), (6, 2), (33, 1), (48, 1), (51, 1), (64, 1), (68, 1), (72, 1), (89, 1), (90, 2), (93, 1), (100, 1), (106, 1), (115, 3), (118, 1), (121, 1), (137, 3), (142, 2), (144, 3), (160, 1), (164, 1), (168, 4), (172, 2), (178, 1), (188, 2), (210, 1), (217, 2), (230, 1), (313, 1), (317, 1), (338, 1), (390, 1), (399, 1), (403, 4), (421, 1), (441, 1), (452, 2), (476, 1), (532, 1), (605, 1), (653, 1), (672, 1), (762, 2), (817, 1), (872, 1), (931, 1), (935, 1), (967, 1), (985, 1), (986, 1), (987, 1)], [(11, 1), (17, 1), (21, 1), (30, 6), (32, 1), (36, 2), (41, 1), (52, 1), (53, 2), (65, 3), (81, 1), (95, 2), (103, 1), (105, 1), (116, 3), (121, 1), (134, 1), (140, 2), (212, 1), (217, 1), (230, 1), (238, 1), (260, 1), (267, 2), (282, 1), (324, 1), (330, 1), (379, 1), (414, 1), (455, 1), (481, 1), (486, 1), (491, 1), (495, 1), (536, 1), (543, 1), (549, 1), (583, 1), (585, 1), (599, 1), (643, 1), (663, 1), (708, 1), (737, 1), (741, 1), (759, 1), (765, 1), (766, 1), (773, 1), (775, 1), (777, 1), (802, 1), (814, 1), (843, 1), (853, 3), (940, 1), (947, 1), (949, 1), (980, 1), (988, 1)], [(24, 3), (34, 2), (51, 3), (61, 1), (65, 2), (66, 1), (79, 1), (103, 1), (121, 1), (131, 1), (136, 1), (139, 2), (143, 1), (154, 2), (155, 1), (168, 2), (172, 1), (174, 1), (195, 1), (219, 2), (223, 5), (224, 1), (237, 1), (241, 2), (242, 1), (246, 1), (248, 1), (270, 1), (322, 1), (324, 1), (356, 1), (364, 1), (390, 1), (392, 3), (396, 2), (399, 1), (403, 2), (422, 2), (452, 1), (474, 1), (515, 1), (543, 1), (593, 1), (609, 1), (622, 3), (634, 1), (645, 1), (648, 1), (668, 1), (712, 1), (714, 1), (748, 1), (768, 1), (859, 1), (862, 1), (931, 1), (937, 1), (971, 1), (982, 1), (989, 2), (990, 1), (991, 1)], [(3, 1), (30, 1), (51, 2), (61, 1), (65, 1), (90, 1), (97, 1), (103, 1), (106, 1), (131, 1), (137, 1), (138, 1), (139, 1), (144, 1), (154, 4), (155, 1), (162, 1), (168, 4), (195, 2), (197, 1), (206, 1), (210, 1), (214, 1), (217, 1), (224, 3), (243, 1), (249, 1), (253, 1), (255, 1), (263, 1), (265, 1), (324, 1), (345, 2), (410, 1), (418, 1), (440, 1), (455, 1), (465, 1), (503, 1), (513, 1), (524, 2), (534, 1), (541, 1), (561, 1), (622, 1), (648, 1), (659, 1), (663, 1), (701, 2), (709, 1), (717, 1), (764, 1), (842, 1), (868, 1), (870, 1), (873, 1), (875, 1), (938, 1), (945, 1), (955, 1), (968, 1), (973, 1), (992, 1), (993, 1), (994, 1), (995, 1), (996, 1)], [(1, 1), (3, 1), (30, 2), (51, 1), (52, 1), (54, 1), (63, 1), (64, 1), (79, 1), (98, 4), (100, 1), (104, 1), (107, 1), (108, 1), (113, 1), (115, 2), (121, 1), (137, 2), (138, 1), (152, 2), (154, 1), (168, 1), (195, 2), (212, 2), (217, 1), (219, 1), (223, 1), (263, 1), (272, 2), (273, 1), (284, 1), (324, 1), (326, 1), (335, 1), (371, 1), (377, 1), (396, 1), (403, 2), (424, 1), (445, 1), (480, 1), (517, 1), (529, 1), (543, 1), (548, 1), (573, 1), (596, 2), (621, 1), (661, 1), (663, 3), (749, 1), (758, 1), (794, 2), (801, 1), (819, 2), (892, 1), (906, 1), (928, 1), (971, 1), (997, 1)], [(1, 3), (4, 1), (6, 1), (24, 4), (31, 4), (33, 1), (41, 3), (50, 1), (55, 2), (61, 1), (63, 2), (64, 1), (65, 16), (83, 1), (90, 5), (105, 1), (106, 1), (108, 1), (115, 2), (119, 1), (121, 4), (123, 1), (138, 1), (141, 2), (142, 1), (144, 5), (152, 2), (154, 5), (156, 2), (168, 3), (170, 1), (194, 1), (205, 2), (214, 1), (217, 1), (230, 1), (257, 1), (265, 1), (267, 2), (271, 6), (272, 1), (324, 1), (331, 2), (338, 1), (342, 3), (352, 1), (367, 1), (395, 1), (403, 3), (413, 3), (418, 1), (422, 1), (429, 1), (442, 1), (447, 1), (454, 4), (456, 1), (478, 1), (481, 1), (482, 1), (524, 2), (567, 1), (582, 1), (619, 1), (653, 2), (654, 1), (667, 1), (673, 1), (693, 3), (725, 1), (738, 1), (757, 1), (775, 1), (802, 1), (806, 1), (835, 2), (845, 2), (871, 1), (912, 1), (918, 1), (950, 1), (952, 1), (977, 1), (993, 1), (998, 1), (999, 1), (1000, 1)], [(1, 1), (5, 1), (6, 1), (15, 1), (22, 1), (30, 5), (31, 1), (32, 1), (51, 1), (65, 2), (66, 1), (90, 1), (92, 1), (108, 1), (115, 4), (119, 1), (121, 1), (129, 1), (138, 2), (142, 3), (143, 2), (144, 2), (148, 1), (154, 1), (168, 2), (172, 1), (173, 1), (176, 3), (188, 3), (193, 2), (195, 3), (202, 1), (215, 1), (222, 1), (224, 1), (232, 1), (265, 1), (294, 1), (324, 1), (371, 1), (390, 1), (414, 1), (416, 1), (419, 1), (441, 1), (481, 1), (510, 1), (513, 2), (532, 1), (562, 1), (569, 1), (617, 1), (624, 1), (633, 1), (648, 1), (679, 3), (708, 1), (719, 1), (720, 1), (721, 1), (723, 1), (728, 1), (729, 1), (762, 3), (772, 2), (790, 1), (813, 1), (817, 4), (826, 1), (839, 1), (849, 1), (854, 1), (855, 1), (867, 2), (876, 1), (894, 1), (905, 1), (918, 1), (931, 1), (936, 1), (951, 2), (962, 1), (981, 1), (986, 1), (989, 1), (990, 1), (994, 1), (999, 1), (1001, 2), (1002, 1), (1003, 1)], [(1, 5), (3, 1), (5, 1), (6, 1), (13, 1), (16, 1), (23, 3), (30, 2), (31, 1), (37, 4), (39, 1), (61, 3), (68, 1), (93, 1), (103, 1), (115, 2), (120, 1), (133, 1), (136, 1), (142, 2), (145, 3), (161, 2), (162, 2), (183, 1), (188, 2), (197, 2), (202, 1), (210, 2), (217, 1), (220, 1), (224, 1), (248, 2), (260, 1), (262, 1), (265, 1), (268, 1), (315, 1), (317, 1), (326, 1), (371, 1), (387, 1), (389, 1), (399, 1), (406, 1), (412, 1), (441, 1), (455, 1), (465, 1), (477, 1), (495, 1), (535, 1), (558, 1), (570, 1), (571, 1), (574, 1), (579, 2), (592, 1), (606, 1), (614, 1), (646, 1), (652, 1), (653, 1), (654, 1), (675, 1), (682, 1), (758, 1), (841, 1), (852, 1), (886, 1), (894, 1), (922, 1), (948, 1), (974, 1), (976, 1)], [(1, 4), (22, 1), (40, 1), (52, 1), (54, 1), (60, 4), (63, 1), (65, 2), (77, 1), (90, 3), (95, 1), (106, 1), (110, 1), (112, 1), (121, 1), (137, 3), (142, 2), (153, 1), (159, 1), (165, 5), (184, 2), (195, 4), (203, 1), (226, 1), (238, 1), (239, 1), (240, 1), (247, 2), (267, 1), (331, 2), (399, 1), (401, 1), (407, 1), (440, 1), (460, 1), (471, 1), (473, 1), (496, 2), (522, 1), (524, 1), (541, 1), (553, 1), (563, 1), (602, 1), (636, 1), (648, 1), (675, 1), (686, 1), (761, 1), (781, 1), (789, 1), (796, 1), (803, 5), (815, 1), (834, 2), (836, 1), (877, 1), (896, 1), (899, 2), (902, 1), (909, 2), (936, 1), (950, 1), (954, 1), (970, 1), (973, 2), (984, 1), (1004, 1), (1005, 1), (1006, 1)], [(5, 1), (6, 1), (23, 1), (29, 1), (30, 2), (31, 3), (39, 1), (45, 1), (48, 1), (51, 1), (61, 4), (63, 1), (65, 2), (68, 1), (93, 1), (94, 1), (103, 1), (115, 1), (120, 1), (131, 1), (137, 2), (142, 4), (148, 1), (154, 1), (168, 2), (172, 1), (177, 1), (195, 2), (197, 1), (210, 2), (263, 1), (265, 1), (267, 1), (315, 1), (324, 2), (332, 1), (390, 1), (399, 1), (483, 1), (484, 1), (555, 1), (563, 1), (600, 1), (629, 1), (638, 1), (673, 1), (686, 1), (699, 2), (733, 1), (741, 2), (749, 2), (759, 1), (771, 1), (781, 1), (813, 1), (817, 2), (849, 1), (856, 1), (881, 1), (903, 1), (917, 1), (930, 1), (931, 1), (957, 1), (958, 1), (965, 1)], [(1, 1), (13, 1), (22, 2), (44, 2), (72, 1), (74, 1), (81, 1), (83, 1), (89, 1), (98, 1), (103, 1), (104, 2), (106, 2), (115, 4), (119, 1), (121, 2), (137, 2), (138, 1), (139, 1), (144, 1), (146, 1), (154, 1), (155, 1), (168, 1), (188, 1), (194, 1), (196, 1), (197, 1), (241, 1), (248, 1), (260, 1), (265, 1), (331, 1), (366, 1), (371, 1), (387, 1), (403, 4), (424, 1), (459, 2), (466, 1), (472, 3), (481, 1), (486, 1), (503, 1), (520, 1), (548, 1), (549, 1), (582, 1), (627, 3), (646, 1), (662, 1), (663, 1), (667, 1), (677, 1), (678, 1), (693, 1), (717, 1), (729, 1), (731, 1), (748, 2), (758, 1), (765, 1), (776, 1), (801, 1), (803, 1), (823, 1), (833, 1), (839, 1), (846, 1), (923, 1), (932, 1), (979, 1), (985, 1), (987, 2), (1001, 1), (1004, 1), (1007, 1), (1008, 1), (1009, 1)], [(0, 1), (1, 1), (15, 1), (17, 1), (22, 2), (23, 2), (24, 1), (30, 3), (41, 1), (60, 1), (68, 1), (96, 1), (97, 1), (103, 1), (116, 1), (117, 2), (133, 1), (142, 1), (144, 1), (165, 2), (184, 1), (195, 2), (197, 2), (210, 1), (211, 3), (220, 1), (223, 1), (232, 1), (243, 1), (263, 1), (279, 1), (324, 1), (366, 2), (387, 1), (403, 1), (413, 1), (434, 1), (444, 1), (448, 1), (462, 1), (477, 2), (481, 1), (491, 1), (507, 1), (528, 1), (554, 1), (575, 1), (582, 1), (608, 1), (631, 1), (646, 1), (673, 1), (689, 1), (708, 1), (726, 1), (738, 1), (812, 1), (817, 2), (850, 1), (861, 1), (874, 1), (891, 1), (895, 1), (897, 1), (900, 1), (912, 1), (924, 1), (974, 1), (1010, 1), (1011, 1)], [(1, 1), (14, 1), (17, 2), (18, 1), (21, 1), (22, 1), (41, 1), (49, 1), (52, 3), (61, 1), (64, 2), (65, 4), (76, 1), (78, 1), (82, 3), (90, 4), (95, 2), (103, 1), (106, 5), (108, 1), (114, 1), (116, 1), (123, 1), (139, 1), (154, 1), (168, 1), (171, 1), (174, 1), (178, 1), (183, 1), (190, 1), (195, 1), (212, 1), (217, 1), (223, 1), (252, 1), (263, 1), (267, 2), (272, 1), (325, 1), (332, 1), (334, 1), (354, 1), (381, 1), (390, 1), (403, 2), (415, 1), (429, 1), (508, 2), (521, 1), (522, 1), (526, 1), (529, 1), (555, 1), (567, 1), (577, 2), (644, 1), (713, 1), (802, 1), (803, 1), (820, 1), (825, 2), (828, 2), (835, 2), (843, 1), (848, 1), (876, 1), (919, 1), (944, 1), (959, 1), (960, 2), (972, 1), (975, 1), (983, 1), (992, 1), (996, 1), (1012, 1)], [(3, 1), (13, 1), (20, 2), (32, 1), (46, 1), (52, 1), (64, 1), (65, 2), (66, 1), (68, 1), (72, 1), (79, 1), (87, 1), (90, 1), (99, 1), (104, 1), (115, 1), (129, 1), (138, 3), (140, 1), (145, 1), (152, 1), (156, 1), (167, 1), (220, 1), (248, 2), (311, 1), (317, 1), (324, 1), (330, 1), (353, 1), (371, 1), (423, 2), (448, 1), (474, 1), (493, 1), (506, 1), (516, 1), (522, 1), (564, 1), (582, 1), (609, 1), (614, 2), (648, 2), (660, 1), (712, 3), (742, 3), (767, 1), (785, 1), (806, 1), (817, 1), (838, 1), (849, 1), (862, 1), (904, 1), (911, 1), (935, 1), (940, 1), (951, 1), (961, 1), (998, 1), (1000, 1), (1010, 1), (1013, 1), (1014, 2)], [(6, 1), (26, 2), (31, 1), (41, 1), (44, 2), (51, 1), (65, 4), (70, 1), (87, 1), (89, 1), (90, 3), (100, 1), (111, 1), (115, 1), (116, 3), (121, 1), (129, 1), (130, 1), (136, 1), (154, 1), (168, 6), (194, 1), (195, 1), (210, 2), (217, 3), (230, 2), (239, 1), (248, 1), (249, 1), (262, 1), (267, 1), (269, 1), (284, 1), (298, 1), (317, 2), (324, 1), (326, 1), (331, 3), (334, 1), (343, 1), (371, 2), (376, 1), (382, 1), (395, 1), (397, 1), (401, 1), (419, 1), (471, 1), (478, 1), (481, 1), (492, 1), (493, 1), (526, 1), (528, 1), (530, 1), (533, 1), (571, 1), (583, 1), (587, 1), (597, 1), (612, 1), (648, 1), (649, 1), (674, 1), (712, 1), (718, 1), (726, 2), (866, 1), (894, 1), (912, 1), (913, 1), (915, 1), (933, 1), (935, 1), (953, 1), (979, 1), (997, 1), (1007, 2), (1008, 1), (1012, 1), (1014, 1), (1015, 1)], [(1, 3), (8, 2), (9, 1), (21, 1), (23, 2), (30, 3), (65, 5), (96, 1), (98, 1), (108, 1), (143, 1), (144, 1), (149, 1), (161, 1), (175, 2), (185, 1), (197, 2), (201, 1), (211, 1), (214, 1), (238, 2), (267, 1), (274, 1), (275, 1), (276, 1), (277, 1), (278, 1), (279, 1), (280, 1), (281, 2), (282, 2), (283, 1), (284, 1), (285, 3), (286, 1), (287, 1), (288, 1), (289, 1), (290, 1), (291, 1), (292, 1), (293, 1), (294, 1), (295, 1), (296, 1), (297, 1), (298, 1), (299, 1), (300, 1), (301, 1), (302, 1), (303, 1), (304, 1), (305, 2), (306, 1), (307, 1), (308, 1), (309, 1), (310, 1), (311, 2), (312, 1), (313, 1), (314, 1), (315, 1), (316, 1), (317, 3), (318, 1), (319, 1), (320, 1), (321, 1), (322, 1), (323, 1), (324, 1), (325, 1), (326, 1), (327, 1), (328, 1), (329, 2), (330, 1), (331, 3), (332, 1), (333, 1)], [(1, 1), (5, 1), (6, 2), (34, 1), (35, 1), (41, 2), (42, 1), (44, 1), (65, 3), (68, 1), (76, 1), (77, 2), (78, 2), (90, 7), (95, 3), (98, 1), (103, 1), (108, 1), (113, 1), (116, 2), (127, 1), (137, 6), (138, 3), (142, 2), (154, 2), (168, 3), (184, 1), (194, 1), (195, 7), (208, 1), (214, 1), (219, 1), (226, 1), (227, 2), (272, 2), (273, 3), (315, 1), (331, 1), (338, 1), (342, 1), (395, 1), (399, 2), (403, 2), (423, 6), (426, 1), (432, 1), (433, 1), (441, 1), (462, 1), (479, 1), (482, 1), (502, 1), (517, 1), (529, 2), (530, 1), (534, 1), (559, 1), (560, 1), (581, 1), (583, 1), (585, 1), (594, 1), (622, 1), (644, 2), (664, 1), (670, 1), (734, 1), (820, 1), (822, 4), (825, 1), (833, 1), (865, 1), (879, 1), (902, 1), (914, 1), (927, 1), (928, 1), (943, 1), (970, 1), (978, 1), (988, 1), (991, 2), (1006, 1), (1009, 1), (1012, 1), (1016, 1), (1017, 1)], [(1, 6), (6, 1), (11, 1), (14, 1), (20, 1), (23, 1), (27, 1), (30, 1), (37, 1), (41, 1), (48, 1), (49, 1), (55, 1), (60, 1), (62, 1), (68, 2), (70, 1), (84, 1), (104, 1), (106, 1), (115, 4), (121, 1), (129, 1), (137, 2), (138, 2), (142, 5), (144, 1), (145, 1), (152, 1), (160, 1), (164, 1), (184, 1), (188, 1), (190, 1), (217, 1), (231, 1), (235, 1), (236, 1), (240, 1), (248, 3), (284, 1), (294, 1), (303, 1), (324, 1), (363, 1), (371, 1), (372, 1), (417, 1), (426, 1), (438, 1), (442, 1), (443, 1), (458, 3), (470, 1), (502, 1), (512, 1), (524, 1), (525, 1), (532, 1), (575, 1), (613, 1), (615, 1), (617, 1), (648, 1), (661, 2), (706, 1), (712, 1), (715, 2), (718, 1), (721, 1), (722, 1), (743, 1), (745, 1), (757, 1), (760, 3), (764, 1), (804, 1), (805, 1), (817, 2), (837, 1), (841, 2), (862, 1), (881, 1), (884, 1), (893, 1), (934, 1), (939, 1), (979, 1), (1011, 1), (1013, 1), (1015, 1), (1016, 1), (1017, 1)], [(6, 1), (23, 3), (34, 1), (36, 1), (42, 1), (51, 3), (61, 1), (65, 3), (68, 1), (77, 1), (83, 1), (84, 1), (90, 1), (110, 1), (116, 1), (121, 1), (131, 2), (134, 1), (137, 1), (151, 1), (154, 1), (168, 4), (172, 1), (188, 1), (195, 3), (205, 1), (210, 1), (212, 1), (213, 1), (214, 1), (220, 1), (224, 1), (226, 1), (228, 1), (235, 1), (241, 2), (242, 1), (261, 1), (279, 3), (294, 1), (308, 1), (331, 2), (364, 1), (402, 2), (403, 2), (410, 1), (412, 1), (454, 1), (456, 1), (473, 1), (474, 1), (491, 1), (493, 1), (513, 3), (557, 1), (613, 1), (665, 1), (675, 1), (680, 1), (685, 1), (691, 2), (692, 1), (718, 1), (745, 1), (748, 1), (776, 1), (795, 1), (803, 2), (818, 1), (827, 1), (898, 1), (940, 1), (956, 1), (970, 1), (995, 1), (1002, 1)], [(1, 1), (11, 2), (12, 1), (22, 1), (23, 1), (28, 1), (30, 2), (41, 2), (51, 2), (52, 1), (65, 1), (68, 1), (87, 1), (90, 1), (106, 1), (113, 2), (115, 1), (121, 1), (134, 2), (148, 1), (152, 3), (154, 2), (155, 1), (159, 1), (168, 2), (169, 1), (172, 2), (176, 2), (195, 1), (202, 2), (210, 3), (219, 1), (241, 1), (262, 1), (265, 1), (272, 2), (302, 1), (349, 1), (390, 2), (395, 2), (402, 1), (403, 2), (465, 2), (469, 1), (474, 1), (513, 1), (557, 1), (579, 2), (584, 1), (587, 2), (598, 1), (614, 1), (622, 1), (628, 1), (669, 1), (707, 1), (733, 1), (757, 1), (828, 1), (844, 1), (847, 1), (882, 1), (888, 1), (942, 1), (966, 1), (969, 1), (973, 1), (1003, 1), (1005, 2)]]
Maximizar Likehood positiva = Minimizar negative Likehood
from sklearn import metrics
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
import time
import pandas as pd
import itertools
start_time = time.time()
#lista= [[0.01,0.01],[0.01,0.31],[0.01,0.61]]
lista= [[0.01,0.01],[0.01,0.31],[0.01,0.61],[0.01,0.91], \
[0.31,0.01],[0.31,0.31],[0.31,0.61],[0.31,0.91],
[0.61,0.01],[0.61,0.31],[0.61,0.61],[0.61,0.91],
[0.91,0.01],[0.91,0.31],[0.91,0.61],[0.91,0.91]]
lista2=[]
for x,y in lista:
#print('alpha=',x,'gamma=',y)
locals()["Modelo_" + 'alpha=' +str(x)+ 'gamma=', str(y)] =HdpModel(corpus=corpus, id2word=dictionary, random_state= 30,alpha=x, gamma=y, K=15,T=5)
locals()['Medida Likehood ='] =locals()["Modelo_" + 'alpha=' +str(x)+ 'gamma=', str(y)].evaluate_test_corpus(corpus)
#print('Likehood =',locals()['Medida Likehood ='])
list_models=[x,y,locals()['Medida Likehood =']]
lista2.append(list_models)
#print('----------------------------')
print("--- Tiempo todos posibles modelos fue de : %s seconds ---" % (time.time() - start_time))
lista2.sort(key=lambda x: x[2:],reverse=True)
df = pd.DataFrame(lista2,columns=['alpha','beta','Total_likehood'])
df.head(20)
--- Tiempo todos posibles modelos fue de : 2.4699997901916504 seconds ---
alpha | beta | Total_likehood | |
---|---|---|---|
0 | 0.91 | 0.91 | -46994.505349 |
1 | 0.91 | 0.31 | -46994.696957 |
2 | 0.91 | 0.61 | -46994.963242 |
3 | 0.91 | 0.01 | -46998.507175 |
4 | 0.61 | 0.91 | -47032.941729 |
5 | 0.61 | 0.61 | -47034.995199 |
6 | 0.61 | 0.31 | -47037.800363 |
7 | 0.61 | 0.01 | -47041.573589 |
8 | 0.31 | 0.91 | -47118.901537 |
9 | 0.31 | 0.61 | -47121.049738 |
10 | 0.31 | 0.31 | -47123.878034 |
11 | 0.31 | 0.01 | -47127.577438 |
12 | 0.01 | 0.91 | -47671.437246 |
13 | 0.01 | 0.61 | -47673.824472 |
14 | 0.01 | 0.31 | -47676.890096 |
15 | 0.01 | 0.01 | -47680.815434 |
from sklearn import metrics
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
import time
import pandas as pd
import itertools
start_time = time.time()
lista= [[5,15],[5,10],(15,150)]
lista2=[]
for x,y in lista:
#print('alpha=',x,'gamma=',y)
locals()["Modelo_" + 'alpha=' +str(x)+ 'gamma='+ str(y)] =HdpModel(corpus=corpus, id2word=dictionary, random_state= 30,alpha=1, gamma=1, K=x,T=y)
locals()['Medida Cv' + 'alpha=' +str(x)+ 'gamma='+ str(y)] =CoherenceModel(model=locals()["Modelo_" + 'alpha=' +str(x)+ 'gamma='+ str(y)], texts=data['Abstract_sin_stopwords2'].to_list(), dictionary=dictionary, coherence='c_v')
locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)]=locals()['Medida Cv' + 'alpha=' +str(x)+ 'gamma='+ str(y)].get_coherence()
list_models=[x,y,locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)]]
lista2.append(list_models)
#print('----------------------------')
print("--- Tiempo todos posibles modelos fue de : %s seconds ---" % (time.time() - start_time))
lista2.sort(key=lambda x: x[2:],reverse=True)
df = pd.DataFrame(lista2,columns=['K','T','C_v'])
df.head(20)
--- Tiempo todos posibles modelos fue de : 21.768982410430908 seconds ---
K | T | C_v | |
---|---|---|---|
0 | 5 | 15 | 0.650789 |
1 | 15 | 150 | 0.650357 |
2 | 5 | 10 | 0.645831 |
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_style("whitegrid")
coherences = [0.366477, 0.311334, 0.309647]
n = len(coherences)
x = [r'$K=15, T=150$', r'$K=5, T=10$', r'$K=5, T=15$']
sns.barplot(x, coherences)
plt.ylabel('cv scores')
plt.title('Comparación de modelos'+ r' $\alpha=1, \beta=1$')
C:\Users\suzak\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
Text(0.5, 1.0, 'Comparación de modelos $\\alpha=1, \\beta=1$')
from sklearn import metrics
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
import time
import pandas as pd
import itertools
start_time = time.time()
#lista= [[5,15],[5,10]]
lista= [[0.01,0.01],[0.01,0.31],[0.01,0.61],[0.01,0.91], \
[0.31,0.01],[0.31,0.31],[0.31,0.61],[0.31,0.91],
[0.61,0.01],[0.61,0.31],[0.61,0.61],[0.61,0.91],
[0.91,0.01],[0.91,0.31],[0.91,0.61],[0.91,0.91]]
lista2=[]
for x,y in lista:
#print('alpha=',x,'gamma=',y)
locals()["Modelo_" + 'alpha=' +str(x)+ 'gamma='+ str(y)] =HdpModel(corpus=corpus, id2word=dictionary, random_state= 30,alpha=x, gamma=y, K=5,T=15)
locals()['Medida Cv' + 'alpha=' +str(x)+ 'gamma='+ str(y)] =CoherenceModel(model=locals()["Modelo_" + 'alpha=' +str(x)+ 'gamma='+ str(y)], texts=data['Abstract_sin_stopwords2'].to_list(), dictionary=dictionary, coherence='c_v')
locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)]=locals()['Medida Cv' + 'alpha=' +str(x)+ 'gamma='+ str(y)].get_coherence()
#print('Likehood =',locals()['Medida Likehood ='])
#print('Cv =',locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)])
list_models=[x,y,locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)]]
lista2.append(list_models)
#print('----------------------------')
print("--- Tiempo todos posibles modelos fue de : %s seconds ---" % (time.time() - start_time))
lista2.sort(key=lambda x: x[2:],reverse=True)
df = pd.DataFrame(lista2,columns=['alpha','beta','C_v'])
df.head(20)
--- Tiempo todos posibles modelos fue de : 38.75088357925415 seconds ---
alpha | beta | C_v | |
---|---|---|---|
0 | 0.01 | 0.01 | 0.650789 |
1 | 0.01 | 0.31 | 0.650789 |
2 | 0.01 | 0.61 | 0.650789 |
3 | 0.01 | 0.91 | 0.650789 |
4 | 0.31 | 0.01 | 0.650789 |
5 | 0.31 | 0.31 | 0.650789 |
6 | 0.31 | 0.61 | 0.650789 |
7 | 0.31 | 0.91 | 0.650789 |
8 | 0.61 | 0.01 | 0.650789 |
9 | 0.61 | 0.31 | 0.650789 |
10 | 0.61 | 0.61 | 0.650789 |
11 | 0.61 | 0.91 | 0.650789 |
12 | 0.91 | 0.01 | 0.650789 |
13 | 0.91 | 0.31 | 0.650789 |
14 | 0.91 | 0.61 | 0.650789 |
15 | 0.91 | 0.91 | 0.650789 |
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_style("whitegrid")
coherences = [0.315997, 0.315887, 0.312725]
n = len(coherences)
x = [r'$\alpha=0.01, \beta=0.91$', r'$\alpha=0.91, \beta=0.61$', r'$\alpha=0.61, \beta=0.91$']
sns.barplot(x, coherences)
plt.ylabel('cv scores')
plt.title('Comparación de modelos'+r' $K=5, T=15$')
C:\Users\suzak\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
Text(0.5, 1.0, 'Comparación de modelos $K=5, T=15$')
from sklearn import metrics
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
import time
import pandas as pd
import itertools
start_time = time.time()
#lista= [[5,15],[5,10]]
lista= [[0.01,0.01],[0.01,0.31],[0.01,0.61],[0.01,0.91], \
[0.31,0.01],[0.31,0.31],[0.31,0.61],[0.31,0.91],
[0.61,0.01],[0.61,0.31],[0.61,0.61],[0.61,0.91],
[0.91,0.01],[0.91,0.31],[0.91,0.61],[0.91,0.91]]
lista2=[]
for x,y in lista:
#print('alpha=',x,'gamma=',y)
locals()["Modelo_" + 'alpha=' +str(x)+ 'gamma='+ str(y)] =HdpModel(corpus=corpus, id2word=dictionary, random_state= 30,alpha=x, gamma=y, K=5,T=10)
locals()['Medida Cv' + 'alpha=' +str(x)+ 'gamma='+ str(y)] =CoherenceModel(model=locals()["Modelo_" + 'alpha=' +str(x)+ 'gamma='+ str(y)], texts=data['Abstract_sin_stopwords2'].to_list(), dictionary=dictionary, coherence='c_v')
locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)]=locals()['Medida Cv' + 'alpha=' +str(x)+ 'gamma='+ str(y)].get_coherence()
#print('Likehood =',locals()['Medida Likehood ='])
#print('Cv =',locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)])
list_models=[x,y,locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)]]
lista2.append(list_models)
#print('----------------------------')
print("--- Tiempo todos posibles modelos fue de : %s seconds ---" % (time.time() - start_time))
lista2.sort(key=lambda x: x[2:],reverse=True)
df = pd.DataFrame(lista2,columns=['alpha','beta','C_v'])
df.head(20)
--- Tiempo todos posibles modelos fue de : 35.484116077423096 seconds ---
alpha | beta | C_v | |
---|---|---|---|
0 | 0.01 | 0.01 | 0.645831 |
1 | 0.01 | 0.31 | 0.645831 |
2 | 0.01 | 0.61 | 0.645831 |
3 | 0.01 | 0.91 | 0.645831 |
4 | 0.31 | 0.01 | 0.645831 |
5 | 0.31 | 0.31 | 0.645831 |
6 | 0.31 | 0.61 | 0.645831 |
7 | 0.31 | 0.91 | 0.645831 |
8 | 0.61 | 0.01 | 0.645831 |
9 | 0.61 | 0.31 | 0.645831 |
10 | 0.61 | 0.61 | 0.645831 |
11 | 0.61 | 0.91 | 0.645831 |
12 | 0.91 | 0.01 | 0.645831 |
13 | 0.91 | 0.31 | 0.645831 |
14 | 0.91 | 0.61 | 0.645831 |
15 | 0.91 | 0.91 | 0.645831 |
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_style("whitegrid")
coherences = [0.332572, 0.318770, 0.314609]
n = len(coherences)
x = [r'$\alpha=0.01, \beta=0.01$', r'$\alpha=0.91, \beta=0.91$', r'$\alpha=0.61, \beta=0.01$']
sns.barplot(x, coherences)
plt.ylabel('cv scores')
plt.title('Comparación de modelos'+ r' $K=5, T=10$')
C:\Users\suzak\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
Text(0.5, 1.0, 'Comparación de modelos $K=5, T=10$')
from sklearn import metrics
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
import time
import pandas as pd
import itertools
start_time = time.time()
#lista= [[5,15],[5,10]]
lista= [[0.01,0.01],[0.01,0.31],[0.01,0.61],[0.01,0.91], \
[0.31,0.01],[0.31,0.31],[0.31,0.61],[0.31,0.91],
[0.61,0.01],[0.61,0.31],[0.61,0.61],[0.61,0.91],
[0.91,0.01],[0.91,0.31],[0.91,0.61],[0.91,0.91]]
lista2=[]
for x,y in lista:
#print('alpha=',x,'gamma=',y)
locals()["Modelo_" + 'alpha=' +str(x)+ 'gamma='+ str(y)] =HdpModel(corpus=corpus, id2word=dictionary, random_state= 30,alpha=x, gamma=y, K=15,T=5)
locals()['Medida Cv' + 'alpha=' +str(x)+ 'gamma='+ str(y)] =CoherenceModel(model=locals()["Modelo_" + 'alpha=' +str(x)+ 'gamma='+ str(y)], texts=data['Abstract_sin_stopwords2'].to_list(), dictionary=dictionary, coherence='c_v')
locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)]=locals()['Medida Cv' + 'alpha=' +str(x)+ 'gamma='+ str(y)].get_coherence()
#print('Likehood =',locals()['Medida Likehood ='])
#print('Cv =',locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)])
list_models=[x,y,locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)]]
lista2.append(list_models)
#print('----------------------------')
print("--- Tiempo todos posibles modelos fue de : %s seconds ---" % (time.time() - start_time))
lista2.sort(key=lambda x: x[2:],reverse=True)
df = pd.DataFrame(lista2,columns=['alpha','beta','C_v'])
df.head(20)
--- Tiempo todos posibles modelos fue de : 33.93630814552307 seconds ---
alpha | beta | C_v | |
---|---|---|---|
0 | 0.01 | 0.01 | 0.655882 |
1 | 0.01 | 0.31 | 0.655882 |
2 | 0.01 | 0.61 | 0.655882 |
3 | 0.01 | 0.91 | 0.655882 |
4 | 0.31 | 0.01 | 0.655882 |
5 | 0.31 | 0.31 | 0.655882 |
6 | 0.31 | 0.61 | 0.655882 |
7 | 0.31 | 0.91 | 0.655882 |
8 | 0.61 | 0.01 | 0.655882 |
9 | 0.61 | 0.31 | 0.655882 |
10 | 0.61 | 0.61 | 0.655882 |
11 | 0.61 | 0.91 | 0.655882 |
12 | 0.91 | 0.01 | 0.655882 |
13 | 0.91 | 0.31 | 0.655882 |
14 | 0.91 | 0.61 | 0.655882 |
15 | 0.91 | 0.91 | 0.655882 |
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D # noqa: F401 unused import
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# setup the figure and axes
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(111, projection='3d')
#ax2 = fig.add_subplot(122, projection='3d')
# fake data
x = [0.01,0.31,0.91,0.61,0.01,0.91,0.31,0.61,0.31,0.31,0.91,0.61,0.91,0.61,0.01,0.01]
y = [0.61,0.01,0.61,0.31,0.01,0.91,0.91,0.01,0.31,0.61,0.01,0.91,0.31,0.61,0.91,0.31]
z = [0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0] # z coordinates of each bar
dx = [0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1] # Width of each bar
dy = [0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1] # Depth of each bar
dz = [0.306621,0.306621,0.298844,0.298843,0.298843,0.298843,0.298842,0.298840,0.298837,0.298832,0.298832,0.298830,0.298830,0.298828,0.298828,0.298828] # Height of each bar
#_xx, _yy = np.meshgrid(_x, _y)
#x, y = _xx.ravel(), _yy.ravel()
cmap = cm.get_cmap('jet') # Get desired colormap
max_height = max(dz) # get range of colorbars
min_height = min(dz)
# scale each z to [0,1], and get their rgb values
rgba = [cmap(20*(k-min_height)/max_height) for k in dz]
ax1.bar3d(x, y, z, dx, dy, dz, color=rgba)
ax1.set_title('Diagrama 3D alpha-beta-Cv')
ax1.set_xlabel('Alpha')
ax1.set_ylabel('Beta')
ax1.set_zlabel('Cv')
#ax2.bar3d(x, y, bottom, width, depth, top, shade=False)
#ax2.set_title('Not Shaded')
plt.show()
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_style("whitegrid")
coherences = [0.306621, 0.302518, 0.278656]
n = len(coherences)
x = [r'$\alpha=0.31, \beta=0.31$', r'$\alpha=0.91, \beta=0.61$', r'$\alpha=1, \beta=1$']
sns.barplot(x, coherences)
plt.ylabel('cv scores')
plt.title('Comparación de modelos')
C:\Users\suzak\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
Text(0.5, 1.0, 'Comparación de modelos')
Modelo 1: $\alpha$=1, $\gamma$=1
%%time
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary, random_state= 30,alpha=1,gamma=1, K=5, T=15)
'''
,max_chunks=None, max_time=None,\
chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01, scale=1.0,\
var_converge=0.0001, outputdir=None)
'''
# corpus = iterable list with the data
# id2word= diccionario con el input del corpus
# max_chunks limite superior de cuantos chunks a procesar
#max_time upper bound para que el modelo sea entrenado
# chunksize= numero de documentos en 1 chunk
# kappa= parametro de aprendizaje que actua como factor de decaimiento exponencial para no influenciar perdida de informacion
# tau= parametro de aprendizaje que pondera las iteraciones
# K = segundo nivel de truncamiento
# T= primer nivel de truncamiento
# alpha= segundo nivel de concentracion
# gamma= primer nivel de concentracion
# eta= Topic Dirichlet
# scale= peso de la informacion obtenida de lso chunks para calcular rhot
# var_converge= limite inferior para convergencia
# outputdir = almacena los topicos y opciones de directorio
# random_state= parametro de aleatoreidad
'''
tiempos de espera
50000 registros = 2 min 38 s
100000 registros = 4 min 26 s
200000 registros = 9 min 17 s
250000 registros = 10 min 52 s
Todos registors = 11 min 18 s
'''
Wall time: 54 ms
'\ntiempos de espera\n50000 registros = 2 min 38 s\n100000 registros = 4 min 26 s\n200000 registros = 9 min 17 s\n250000 registros = 10 min 52 s\nTodos registors = 11 min 18 s\n'
Función para mostrar los topicos del modelo
# Esta es mi funcion para despues de correr los modelos extraer los topicos
def display_topics(model, model_type="lda"):
for topic_idx, topic in enumerate(model.print_topics()):
print ("Topic %d:" % (topic_idx))
if model_type== "hdp":
print (" ".join(re.findall( r'\*(.[^\*-S]+).?', topic[1])), "\n")
else:
print (" ".join(re.findall( r'\"(.[^"]+).?', topic[1])), "\n")
# hdpmodel.show_topics()
display_topics(hdpmodel, model_type="hdp")
Topic 0: achieve remotely harvest reliance network beneficial energy income novel sensors Topic 1: developments times carry reference sustainable factor base total via expert Topic 2: scheme analysis new generated regarding namely alerts deep decision growth Topic 3: effort directly remarks freshwater especially actuating wide perform villages food Topic 4: taking exploitation ongoing responsibility highlights create adoption efficient due based Topic 5: ideal simulation work individuals objective sends recent mechanism evaluated method Topic 6: infrastructure depends entire focus profitable shared 3 wsn mqtt set Topic 7: gives large humidity provided pa produce automate wastage actual performs Topic 8: system iot art wireless vision thanks etc derived discussed Topic 9: uplink technical levels crucial account achieved exponential power generated lpwa Topic 10: sense resulted volumetric strategies growth humanity key communications pump enabling Topic 11: introduce two easy mind tem effective tank irrigation gradient prevailing Topic 12: addition many enhance proposes deliver tem showing successfully monitor plant Topic 13: describes smaller impoverished done integrate considering mobile named shown income Topic 14: insights applying novel greece yield status compared time supporting hence
Modelo 2: $\alpha$=0.31, $\gamma$=0.31
%%time
hdpmodel2 = HdpModel(corpus=corpus, id2word=dictionary, random_state= 30,alpha=0.31,gamma=0.31, K=5, T=15)
Wall time: 56 ms
# hdpmodel.show_topics()
display_topics(hdpmodel2, model_type="hdp")
Topic 0: achieve remotely harvest reliance network beneficial energy income novel sensors Topic 1: developments times carry reference sustainable factor base total via expert Topic 2: scheme analysis new generated regarding namely alerts deep decision growth Topic 3: effort directly remarks freshwater especially actuating wide perform villages food Topic 4: taking exploitation ongoing responsibility highlights create adoption efficient due based Topic 5: ideal simulation work individuals objective sends recent mechanism evaluated method Topic 6: infrastructure depends entire focus profitable shared 3 wsn mqtt set Topic 7: gives large humidity provided pa produce automate wastage actual performs Topic 8: system iot art wireless vision thanks etc derived discussed Topic 9: uplink technical levels crucial account achieved exponential power generated lpwa Topic 10: sense resulted volumetric strategies growth humanity key communications pump enabling Topic 11: introduce two easy mind tem effective tank irrigation gradient prevailing Topic 12: addition many enhance proposes deliver tem showing successfully monitor plant Topic 13: describes smaller impoverished done integrate considering mobile named shown income Topic 14: insights applying novel greece yield status compared time supporting hence
Modelo 3: $\alpha$=0.01, $\gamma$=1
%%time
hdpmodel3 = HdpModel(corpus=corpus, id2word=dictionary, random_state= 30,alpha=0.01,gamma=1, K=5, T=15)
Wall time: 53 ms
# hdpmodel.show_topics()
display_topics(hdpmodel3, model_type="hdp")
Topic 0: achieve remotely harvest reliance network beneficial energy income novel sensors Topic 1: developments times carry reference sustainable factor base total via expert Topic 2: scheme analysis new generated regarding namely alerts deep decision growth Topic 3: effort directly remarks freshwater especially actuating wide perform villages food Topic 4: taking exploitation ongoing responsibility highlights create adoption efficient due based Topic 5: ideal simulation work individuals objective sends recent mechanism evaluated method Topic 6: infrastructure depends entire focus profitable shared 3 wsn mqtt set Topic 7: gives large humidity provided pa produce automate wastage actual performs Topic 8: system iot art wireless vision thanks etc derived discussed Topic 9: uplink technical levels crucial account achieved exponential power generated lpwa Topic 10: sense resulted volumetric strategies growth humanity key communications pump enabling Topic 11: introduce two easy mind tem effective tank irrigation gradient prevailing Topic 12: addition many enhance proposes deliver tem showing successfully monitor plant Topic 13: describes smaller impoverished done integrate considering mobile named shown income Topic 14: insights applying novel greece yield status compared time supporting hence
Modelo 4: $\alpha$=1, $\gamma$=0.01
%%time
hdpmodel4 = HdpModel(corpus=corpus, id2word=dictionary, random_state= 30,alpha=1,gamma=0.01, K=5, T=15)
Wall time: 59 ms
# hdpmodel.show_topics()
display_topics(hdpmodel4, model_type="hdp")
Topic 0: achieve remotely harvest reliance network beneficial energy income novel sensors Topic 1: developments times carry reference sustainable factor base total via expert Topic 2: scheme analysis new generated regarding namely alerts deep decision growth Topic 3: effort directly remarks freshwater especially actuating wide perform villages food Topic 4: taking exploitation ongoing responsibility highlights create adoption efficient due based Topic 5: ideal simulation work individuals objective sends recent mechanism evaluated method Topic 6: infrastructure depends entire focus profitable shared 3 wsn mqtt set Topic 7: gives large humidity provided pa produce automate wastage actual performs Topic 8: system iot art wireless vision thanks etc derived discussed Topic 9: uplink technical levels crucial account achieved exponential power generated lpwa Topic 10: sense resulted volumetric strategies growth humanity key communications pump enabling Topic 11: introduce two easy mind tem effective tank irrigation gradient prevailing Topic 12: addition many enhance proposes deliver tem showing successfully monitor plant Topic 13: describes smaller impoverished done integrate considering mobile named shown income Topic 14: insights applying novel greece yield status compared time supporting hence
def evaluate_graph(dictionary, corpus, texts, limit, model):
"""
Funcion para mostrar el numero de topicos - LDA graph usando c_v coherence
Parametros:
----------
dictionary : Gensim diccionario
corpus : Gensim corpus
limit : topic limit
Que devuelve:
-------
lm_list : Lista de LDA topic models
c_v : Coherence values correspondientes al LDA model con los respectivos numero de topicos
"""
c_v = []
lm_list = []
for num_topics in range(1, limit):
if model == 'lsi':
lm = LsiModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
else:
lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
lm_list.append(lm)
cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
c_v.append(cm.get_coherence())
# Show graph
x = range(1, limit)
plt.plot(x, c_v)
plt.xlabel("Número de tópicos")
plt.ylabel("Score de Coherencia")
#plt.legend(("c_v"), loc='best')
plt.show()
return lm_list, c_v
%%time
lmlist_lsi, c_v = evaluate_graph(dictionary=dictionary, corpus=corpus, texts=data['Abstract_sin_stopwords2'].to_list(), limit=21, model= "lsi")
'''
tiempo 50000 registros= 2 min 32 s
tiempo 100000 registros = 5min 5 sec
tiempo 200000 registros = 16min 25 sec
tiempo 250000 registros = 25min 41 sec
'''
Wall time: 41.7 s
'\ntiempo 50000 registros= 2 min 32 s\ntiempo 100000 registros = 5min 5 sec\ntiempo 200000 registros = 16min 25 sec\ntiempo 250000 registros = 25min 41 sec\n'
# Tambien conocido como el LSA model
#%%time
lsimodel = LsiModel(corpus=corpus, num_topics=9, id2word=dictionary)
'''
tiempo ejecucion 50000 registros 5.1 sec
tiempo ejecucion 100000 registros 8.91 sec
tiempo ejecucion 200000 registros 21.9 sec
tiempo ejecucion 250000 registros 33.0 sec
todos registros 33.0 sec
'''
'\ntiempo ejecucion 50000 registros 5.1 sec\ntiempo ejecucion 100000 registros 8.91 sec\ntiempo ejecucion 200000 registros 21.9 sec\ntiempo ejecucion 250000 registros 33.0 sec\ntodos registros 33.0 sec\n'
display_topics(lsimodel) # Showing the topics
Topic 0: irrigation water system iot data management farmers soil agriculture crop Topic 1: irrigation agriculture iot water irrigweb moisture soil management sensor system Topic 2: water iot agriculture moisture agricultural data farmers well management crop Topic 3: iot data soil water moisture crop sensor irrigweb temperature method Topic 4: iot data agriculture would agricultural farmers system india also time Topic 5: agriculture system irrigweb systems soil information irrigation field time wisa Topic 6: system agriculture data farmers proposed irrigweb smart use automatic iot Topic 7: systems farming soil data model use different would sensors well Topic 8: system iot data model management farmers development farmer moisture water
data.shape
(68, 15)
%%time
lmlist, c_v = evaluate_graph(dictionary=dictionary, corpus=corpus, texts=data['Abstract_sin_stopwords2'].to_list(), limit=21, model= "lda")
'''
50000 registros = 8 min 28 sec
100000 registros = 15 min 25 s
200000 registros = 30 min 29 s
250000 registros = 35 min 01 s
todos registros= =
'''
Wall time: 53 s
'\n50000 registros = 8 min 28 sec\n100000 registros = 15 min 25 s\n200000 registros = 30 min 29 s\n250000 registros = 35 min 01 s\ntodos registros= =\n'
from sklearn import metrics
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
import time
import pandas as pd
import itertools
start_time = time.time()
#lista= [[0.01,0.01],[0.91,0.31]]
lista= [[0.01,0.01],[0.01,0.31],[0.01,0.61],[0.01,0.91], \
[0.31,0.01],[0.31,0.31],[0.31,0.61],[0.31,0.91],
[0.61,0.01],[0.61,0.31],[0.61,0.61],[0.61,0.91],
[0.91,0.01],[0.91,0.31],[0.91,0.61],[0.91,0.91]]
lista2=[]
for x,y in lista:
#print('alpha=',x,'gamma=',y)
locals()["Modelo_" + 'alpha=' +str(x)+ 'gamma='+ str(y)] =LdaModel(corpus=corpus, id2word=dictionary, alpha=x, eta=y, num_topics=11)
locals()['Medida Cv' + 'alpha=' +str(x)+ 'gamma='+ str(y)] =CoherenceModel(model=locals()["Modelo_" + 'alpha=' +str(x)+ 'gamma='+ str(y)], texts=data['Abstract_sin_stopwords2'].to_list(), dictionary=dictionary, coherence='c_v')
locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)]=locals()['Medida Cv' + 'alpha=' +str(x)+ 'gamma='+ str(y)].get_coherence()
#print('Likehood =',locals()['Medida Likehood ='])
#print('Cv =',locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)])
list_models=[x,y,locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)]]
lista2.append(list_models)
#print('----------------------------')
print("--- Tiempo todos posibles modelos fue de : %s seconds ---" % (time.time() - start_time))
lista2.sort(key=lambda x: x[2:],reverse=True)
df = pd.DataFrame(lista2,columns=['alpha','eta','C_v'])
df.head(20)
--- Tiempo todos posibles modelos fue de : 36.48182201385498 seconds ---
alpha | eta | C_v | |
---|---|---|---|
0 | 0.91 | 0.31 | 0.353385 |
1 | 0.61 | 0.31 | 0.352509 |
2 | 0.31 | 0.91 | 0.349906 |
3 | 0.91 | 0.01 | 0.349874 |
4 | 0.61 | 0.91 | 0.346555 |
5 | 0.91 | 0.61 | 0.346364 |
6 | 0.01 | 0.91 | 0.341860 |
7 | 0.61 | 0.61 | 0.341469 |
8 | 0.91 | 0.91 | 0.340781 |
9 | 0.01 | 0.31 | 0.338051 |
10 | 0.61 | 0.01 | 0.337559 |
11 | 0.31 | 0.01 | 0.337258 |
12 | 0.31 | 0.61 | 0.334239 |
13 | 0.01 | 0.61 | 0.331950 |
14 | 0.01 | 0.01 | 0.326441 |
15 | 0.31 | 0.31 | 0.319943 |
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# setup the figure and axes
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(111, projection='3d')
#ax2 = fig.add_subplot(122, projection='3d')
# fake data
x = [0.01,0.31,0.91,0.61,0.01,0.91,0.31,0.61,0.31,0.31,0.91,0.61,0.91,0.61,0.01,0.01]
y = [0.61,0.01,0.61,0.31,0.01,0.91,0.91,0.01,0.31,0.61,0.01,0.91,0.31,0.61,0.91,0.31]
z = [0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0] # z coordinates of each bar
dx = [0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1] # Width of each bar
dy = [0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1] # Depth of each bar
dz = [0.638940,0.638136,0.636483,0.634196,0.634172,0.628971,0.628425,0.626535,0.623586,0.623017,0.622600,0.620396,0.616200,0.604747,0.588063,0.534966] # Height of each bar
#_xx, _yy = np.meshgrid(_x, _y)
#x, y = _xx.ravel(), _yy.ravel()
cmap = cm.get_cmap('jet') # Get desired colormap
max_height = max(dz) # get range of colorbars
min_height = min(dz)
# scale each z to [0,1], and get their rgb values
rgba = [cmap(5*(k-min_height)/max_height) for k in dz]
ax1.bar3d(x, y, z, dx, dy, dz, color=rgba)
ax1.set_title('Diagrama 3D alpha-beta-Cv')
ax1.set_xlabel('Alpha')
ax1.set_ylabel('Beta')
ax1.set_zlabel('Cv')
#ax2.bar3d(x, y, bottom, width, depth, top, shade=False)
#ax2.set_title('Not Shaded')
plt.show()
%%time
ldamodel = LdaModel(corpus=corpus, num_topics=9, id2word=dictionary, alpha=0.01, eta=0.61)
'''
100000 registros 37.9 s
200000 registros 1 min 16 sec
250000 registros 1 min 27 sec
todos registors = 1 min 31 sec
'''
Wall time: 122 ms
'\n100000 registros 37.9 s\n200000 registros 1 min 16 sec\n250000 registros 1 min 27 sec\ntodos registors = 1 min 31 sec\n'
display_topics(ldamodel)
Topic 0: irrigation water iot soil system farmers data crop systems moisture Topic 1: irrigation iot water data agriculture based farmers would farmer system Topic 2: system iot agriculture crop data management sensor using agricultural technologies Topic 3: system irrigation iot agriculture proposed based water agricultural farmers monitoring Topic 4: irrigation water agriculture iot system data management farmer pa systems Topic 5: irrigation water system data systems management soil iot sensor using Topic 6: iot system irrigation farmers agriculture water data agricultural farming smart Topic 7: system water irrigation agriculture iot rural farmers data proposed agricultural Topic 8: soil iot data water irrigation crop system sensors using agriculture
from gensim import corpora, models
def evaluate_graph(dictionary, corpus, texts, limit, model):
"""
Funcion para mostrar el numero de topicos - LDA graph usando c_v coherence
Parametros:
----------
dictionary : Gensim diccionario
corpus : Gensim corpus
limit : topic limit
Que devuelve:
-------
lm_list : Lista de LDA topic models
c_v : Coherence values correspondientes al LDA model con los respectivos numero de topicos
"""
c_v = []
lm_list = []
for num_topics in range(1, limit):
if model == 'lsi':
lm = LsiModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
elif model == 'ldamulticore':
lm = models.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary)
else:
lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
lm_list.append(lm)
cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
c_v.append(cm.get_coherence())
# Show graph
x = range(1, limit)
plt.plot(x, c_v)
plt.xlabel("Número de tópicos")
plt.ylabel("Score de Coherencia")
#plt.legend(("c_v"), loc='best')
plt.show()
return lm_list, c_v
%%time
lmlist_multicore, c_v = evaluate_graph(dictionary=dictionary, corpus=corpus, texts=data['Abstract_sin_stopwords2'].to_list(), limit=21, model= "ldamulticore")
'''
50000 registros = 8 min 28 sec
100000 registros = 15 min 21 s
200000 registros = 30 min 35 s
250000 registros = 35 min 25 s
'''
Wall time: 2min 28s
'\n50000 registros = 8 min 28 sec\n100000 registros = 15 min 21 s\n200000 registros = 30 min 35 s\n250000 registros = 35 min 25 s\n'
from sklearn import metrics
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim import corpora, models
import time
import pandas as pd
import itertools
start_time = time.time()
#lista= [[0.01,0.01]]
lista= [[0.01,0.01],[0.01,0.31],[0.01,0.61],[0.01,0.91], \
[0.31,0.01],[0.31,0.31],[0.31,0.61],[0.31,0.91],
[0.61,0.01],[0.61,0.31],[0.61,0.61],[0.61,0.91],
[0.91,0.01],[0.91,0.31],[0.91,0.61],[0.91,0.91]]
lista2=[]
for x,y in lista:
#print('alpha=',x,'gamma=',y)
locals()["Modelo_" + 'alpha=' +str(x)+ 'gamma='+ str(y)] =models.LdaMulticore(corpus=corpus, id2word=dictionary, alpha=x, eta=y, num_topics=11)
locals()['Medida Cv' + 'alpha=' +str(x)+ 'gamma='+ str(y)] =CoherenceModel(model=locals()["Modelo_" + 'alpha=' +str(x)+ 'gamma='+ str(y)], texts=data['Abstract_sin_stopwords2'].to_list(), dictionary=dictionary, coherence='c_v')
locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)]=locals()['Medida Cv' + 'alpha=' +str(x)+ 'gamma='+ str(y)].get_coherence()
#print('Likehood =',locals()['Medida Likehood ='])
#print('Cv =',locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)])
list_models=[x,y,locals()['CV' + 'alpha=' + str(x)+ 'gamma=' + str(y)]]
lista2.append(list_models)
#print('----------------------------')
print("--- Tiempo todos posibles modelos fue de : %s seconds ---" % (time.time() - start_time))
lista2.sort(key=lambda x: x[2:],reverse=True)
df = pd.DataFrame(lista2,columns=['alpha','eta','C_v'])
df.head(20)
--- Tiempo todos posibles modelos fue de : 152.1322464942932 seconds ---
alpha | eta | C_v | |
---|---|---|---|
0 | 0.91 | 0.91 | 0.352706 |
1 | 0.01 | 0.91 | 0.350090 |
2 | 0.91 | 0.61 | 0.348013 |
3 | 0.91 | 0.01 | 0.347172 |
4 | 0.31 | 0.61 | 0.343749 |
5 | 0.61 | 0.91 | 0.342270 |
6 | 0.31 | 0.01 | 0.338896 |
7 | 0.91 | 0.31 | 0.338753 |
8 | 0.01 | 0.31 | 0.337064 |
9 | 0.61 | 0.61 | 0.336736 |
10 | 0.61 | 0.31 | 0.335897 |
11 | 0.31 | 0.91 | 0.335522 |
12 | 0.31 | 0.31 | 0.327734 |
13 | 0.61 | 0.01 | 0.322204 |
14 | 0.01 | 0.61 | 0.316652 |
15 | 0.01 | 0.01 | 0.310992 |
import numpy as np
import matplotlib.pyplot as plt
# This import registers the 3D projection, but is otherwise unused.
from mpl_toolkits.mplot3d import Axes3D # noqa: F401 unused import
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# setup the figure and axes
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(111, projection='3d')
#ax2 = fig.add_subplot(122, projection='3d')
# fake data
x = [0.01,0.31,0.91,0.61,0.01,0.91,0.31,0.61,0.31,0.31,0.91,0.61,0.91,0.61,0.01,0.01]
y = [0.61,0.01,0.61,0.31,0.01,0.91,0.91,0.01,0.31,0.61,0.01,0.91,0.31,0.61,0.91,0.31]
z = [0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0] # z coordinates of each bar
dx = [0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1] # Width of each bar
dy = [0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1] # Depth of each bar
dz = [0.6327898,0.632367,0.632254,0.632196,0.632087,0.630765,0.630654,0.626171,0.623498,0.622190,0.620138,0.617379,0.615242,0.613093,0.609715,0.593894] # Height of each bar
#_xx, _yy = np.meshgrid(_x, _y)
#x, y = _xx.ravel(), _yy.ravel()
cmap = cm.get_cmap('jet') # Get desired colormap
max_height = max(dz) # get range of colorbars
min_height = min(dz)
# scale each z to [0,1], and get their rgb values
rgba = [cmap(13*(k-min_height)/max_height) for k in dz]
ax1.bar3d(x, y, z, dx, dy, dz, color=rgba)
ax1.set_title('Diagrama 3D alpha-beta-Cv')
ax1.set_xlabel('Alpha')
ax1.set_ylabel('Beta')
ax1.set_zlabel('Cv')
#ax2.bar3d(x, y, bottom, width, depth, top, shade=False)
#ax2.set_title('Not Shaded')
plt.show()
%%time
from gensim import corpora, models
ldamulticoremodel = models.LdaMulticore(corpus=corpus, num_topics=13, id2word=dictionary)
'''
100000 registros = 1 min 22 sec
200000 registros = 2 min 37 sec
250000 registros = 3 min 2 sec
todos registros =
'''
Wall time: 7.28 s
'\n100000 registros = 1 min 22 sec\n200000 registros = 2 min 37 sec\n250000 registros = 3 min 2 sec\ntodos registros =\n'
display_topics(ldamulticoremodel)
Topic 0: soil iot water moisture system data platform systems sensors information Topic 1: irrigation water system farmers agriculture iot agricultural level crop rural Topic 2: iot water agriculture irrigation soil use farming network rural monitoring Topic 3: iot system farming farmers irrigation water agriculture soil proposed management Topic 4: water irrigation soil moisture field system data temperature using based Topic 5: water irrigation iot system farmers agriculture systems smart data technologies Topic 6: irrigation agriculture system farmers crop support market use using also Topic 7: irrigation iot data system management water crop farmers open using Topic 8: irrigation system data water management soil iot crop sensor based Topic 9: agriculture system irrigation iot water farmers data proposed model rural Topic 10: irrigation agriculture water system iot systems data smart farmers farming Topic 11: water irrigation system sensor agriculture field using soil development proposed Topic 12: iot water irrigation agriculture system proposed based crop soil farming
Comparación con la coherencia de cada modelo
ldamodel = lmlist[9]
ldamulticore=lmlist_multicore[9]
lsimodel = lmlist_lsi[9]
lsitopics = [[word for word, prob in topic] for topicid, topic in lsimodel.show_topics(formatted=False)]
hdptopics = [[word for word, prob in topic] for topicid, topic in hdpmodel2.show_topics(formatted=False)]
ldatopics = [[word for word, prob in topic] for topicid, topic in ldamodel.show_topics(formatted=False)]
ldamulticoretopics = [[word for word, prob in topic] for topicid, topic in ldamulticoremodel.show_topics(formatted=False)]
%%time
lsi_coherence = CoherenceModel(topics=lsitopics[:10], texts=data['Abstract_sin_stopwords2'].to_list(), dictionary=dictionary, window_size=10).get_coherence()
hdp_coherence = CoherenceModel(topics=hdptopics[:10], texts=data['Abstract_sin_stopwords2'].to_list(), dictionary=dictionary, window_size=10).get_coherence()
lda_coherence = CoherenceModel(topics=ldatopics, texts=data['Abstract_sin_stopwords2'].to_list(), dictionary=dictionary, window_size=10).get_coherence()
ldamulticore_coherence = CoherenceModel(topics=ldamulticoretopics, texts=data['Abstract_sin_stopwords2'].to_list(), dictionary=dictionary, window_size=10).get_coherence()
Wall time: 7.69 s
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_style("whitegrid")
coherences = [lsi_coherence, hdp_coherence, lda_coherence, ldamulticore_coherence]
n = len(coherences)
x = ['lsi_coherence','hdp_coherence', 'lda_coherence','ldaMC_coherence']
sns.barplot(x, coherences)
plt.ylabel('cv scores')
plt.title('Comparación de modelos')
C:\Users\suzak\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
Text(0.5, 1.0, 'Comparación de modelos')
En este caso el modelo hdp_coherence con cerca de 9 tópicos es el que presenta la mayor coherencia.
display_topics(ldamodel)
Topic 0: irrigation water agriculture system iot data soil proposed monitoring using Topic 1: water system irrigation iot systems using based parameters weather agricultural Topic 2: water irrigation system iot agriculture data soil crop proposed using Topic 3: irrigation system iot water data management crop farmers soil agriculture Topic 4: irrigation iot agriculture would network farming farm water farmers management Topic 5: system irrigation soil agriculture data water proposed field iot information Topic 6: iot irrigation water system data smart agriculture systems based crop Topic 7: agriculture water iot irrigation system based technology smart farming farmers Topic 8: water system rural iot platform soil development management irrigation proposed Topic 9: iot water system irrigation farmers data agriculture farming management agricultural
De acuerdo con esto los topicos son:
Ya que se selecciono el mejor modelo y el mejor número de topicos, es tiempo de asignar tópicos a cada uno de los registros lo cual se puede ver como un cluster de acuerdo con los topicos.
%%time
def format_topics_sentences(ldamodel=0, corpus=corpus, texts=0):
# Output inicial
sent_topics_df = pd.DataFrame()-n
# Obtener el topico mas importante en cada documento
for i, row in enumerate(ldamodel[corpus]):
row = sorted(row, key=lambda x: (x[1]), reverse=True)
# Obtener el topico dominante por contribucion y palabras clave para cada documento
for j, (topic_num, prop_topic) in enumerate(row):
if j == 0: # => topico dominante
wp = ldamodel.show_topic(topic_num)
topic_keywords = ", ".join([word for word, prop in wp])
sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
else:
break
sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
# Adicionar el texto original al final del output
contents = pd.Series(texts)
sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
return(sent_topics_df)
df_topic_sents_keywords = format_topics_sentences(ldamodel, corpus=corpus, texts=data['Abstract_sin_stopwords2'].to_list())
'''
200000 registros = 33 min 43 sec
250000 registros = 49 min 32 sec
'''
Wall time: 164 ms
'\n200000 registros = 33 min 43 sec\n250000 registros = 49 min 32 sec\n'
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# Show
df_dominant_topic.head(10)
Document_No | Dominant_Topic | Topic_Perc_Contrib | Keywords | Text | |
---|---|---|---|---|---|
0 | 0 | 7.0 | 0.9894 | agriculture, water, iot, irrigation, system, b... | [agriculture, considered, one, major, sources,... |
1 | 1 | 3.0 | 0.7507 | irrigation, system, iot, water, data, manageme... | [automatic, irrigation, scheduling, systems, h... |
2 | 2 | 2.0 | 0.7769 | water, irrigation, system, iot, agriculture, d... | [iot, iot, plays, vital, role, entity, sharing... |
3 | 3 | 3.0 | 0.6097 | irrigation, system, iot, water, data, manageme... | [new, technologies, potential, transform, agri... |
4 | 4 | 1.0 | 0.9852 | water, system, irrigation, iot, systems, using... | [technological, innovations, affect, human, ac... |
5 | 5 | 9.0 | 0.9916 | iot, water, system, irrigation, farmers, data,... | [scarcity, desertification, considered, among,... |
6 | 6 | 5.0 | 0.9899 | system, irrigation, soil, agriculture, data, w... | [center, pivot, systems, widely, used, overcom... |
7 | 7 | 1.0 | 0.5525 | water, system, irrigation, iot, systems, using... | [agriculture, wireless, sensor, networks, lora... |
8 | 8 | 3.0 | 0.9899 | irrigation, system, iot, water, data, manageme... | [article, present, design, iot, dynamic, irrig... |
9 | 9 | 2.0 | 0.9920 | water, irrigation, system, iot, agriculture, d... | [various, advancements, made, numerous, domain... |
df_dominant_topic['Dominant_Topic'].unique()
array([7., 3., 2., 1., 9., 5., 0., 8., 4., 6.])
De acuerdo con esto los topicos son:
Podemos crear un diccionario con los topicos resumidos en una palabra para crear una funcion y hacer un mapeo sobre los numeros.
label_dicc = {0:'Topic 1', 1:'Topic 2', 2:'Topic 3', 3: 'Topic 4', 4:'Topic 5', 5:'Topic 6', 6:'Topic 7', 7:"Topic 8", 8: "Topic 9"
}
df_dominant_topic['Dominant_Topic'] = df_dominant_topic['Dominant_Topic'].astype('int64')
df_dominant_topic['Dominant_Topic'] = df_dominant_topic['Dominant_Topic'].map(label_dicc)
df_dominant_topic.head(10)
Document_No | Dominant_Topic | Topic_Perc_Contrib | Keywords | Text | |
---|---|---|---|---|---|
0 | 0 | Topic 8 | 0.9894 | agriculture, water, iot, irrigation, system, b... | [agriculture, considered, one, major, sources,... |
1 | 1 | Topic 4 | 0.7507 | irrigation, system, iot, water, data, manageme... | [automatic, irrigation, scheduling, systems, h... |
2 | 2 | Topic 3 | 0.7769 | water, irrigation, system, iot, agriculture, d... | [iot, iot, plays, vital, role, entity, sharing... |
3 | 3 | Topic 4 | 0.6097 | irrigation, system, iot, water, data, manageme... | [new, technologies, potential, transform, agri... |
4 | 4 | Topic 2 | 0.9852 | water, system, irrigation, iot, systems, using... | [technological, innovations, affect, human, ac... |
5 | 5 | NaN | 0.9916 | iot, water, system, irrigation, farmers, data,... | [scarcity, desertification, considered, among,... |
6 | 6 | Topic 6 | 0.9899 | system, irrigation, soil, agriculture, data, w... | [center, pivot, systems, widely, used, overcom... |
7 | 7 | Topic 2 | 0.5525 | water, system, irrigation, iot, systems, using... | [agriculture, wireless, sensor, networks, lora... |
8 | 8 | Topic 4 | 0.9899 | irrigation, system, iot, water, data, manageme... | [article, present, design, iot, dynamic, irrig... |
9 | 9 | Topic 3 | 0.9920 | water, irrigation, system, iot, agriculture, d... | [various, advancements, made, numerous, domain... |
data['labels'] = df_dominant_topic['Dominant_Topic']
data['Topic_Perc_Contrib'] = df_dominant_topic['Topic_Perc_Contrib']
data['Keywords'] = df_dominant_topic['Keywords']
data['Text_new'] = df_dominant_topic['Text']
print(df_dominant_topic.shape)
print(data.shape)
(68, 5) (68, 15)
df_dominant_topic.head()
Document_No | Dominant_Topic | Topic_Perc_Contrib | Keywords | Text | |
---|---|---|---|---|---|
0 | 0 | Topic 8 | 0.9894 | agriculture, water, iot, irrigation, system, b... | [agriculture, considered, one, major, sources,... |
1 | 1 | Topic 4 | 0.7507 | irrigation, system, iot, water, data, manageme... | [automatic, irrigation, scheduling, systems, h... |
2 | 2 | Topic 3 | 0.7769 | water, irrigation, system, iot, agriculture, d... | [iot, iot, plays, vital, role, entity, sharing... |
3 | 3 | Topic 4 | 0.6097 | irrigation, system, iot, water, data, manageme... | [new, technologies, potential, transform, agri... |
4 | 4 | Topic 2 | 0.9852 | water, system, irrigation, iot, systems, using... | [technological, innovations, affect, human, ac... |
# Escribir el CSV con los datos de interes
df_dominant_topic.to_csv('df_dominant_topic_2.csv')
%%time
data.to_csv('data_topics_2.csv')
Wall time: 59 ms
data["Abstract_clean"]=data["Abstract_sin_stopwords2"]
data[['Abstract_clean', 'labels']].head(10)
Abstract_clean | labels | |
---|---|---|
0 | [agriculture, considered, one, major, sources,... | Topic 8 |
1 | [automatic, irrigation, scheduling, systems, h... | Topic 4 |
2 | [iot, iot, plays, vital, role, entity, sharing... | Topic 3 |
3 | [new, technologies, potential, transform, agri... | Topic 4 |
4 | [technological, innovations, affect, human, ac... | Topic 2 |
5 | [scarcity, desertification, considered, among,... | NaN |
6 | [center, pivot, systems, widely, used, overcom... | Topic 6 |
7 | [agriculture, wireless, sensor, networks, lora... | Topic 2 |
8 | [article, present, design, iot, dynamic, irrig... | Topic 4 |
9 | [various, advancements, made, numerous, domain... | Topic 3 |
#Para hacer filtro de interés
data[data['labels'] == 'Topic 6'].head().Abstract_clean
6 [center, pivot, systems, widely, used, overcom... 20 [last, years, existing, technologies, applied,... 44 [submitted, 17, november, 2020, accepted, 13, ... 48 [study, propose, smart, plant, irrigation, iot... 54 [agriculture, one, main, sources, income, rura... Name: Abstract_clean, dtype: object
ax = df_dominant_topic['Dominant_Topic'].value_counts().plot(kind='bar')
plt.ylabel('Frecuencia')
plt.title('Distribución de tópicos en el corpus textual')
plt.show()
import matplotlib.pyplot as plt
#fig = plt.figure()
#ax = fig.add_axes([0,0,1,1])
langs = ['Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5', 'Topic 6', 'Topic 7', 'Topic 8', 'Topic 9']
articles = [8,5,10,18,2, 7, 6, 2, 3]
plt.bar(langs,articles,width=0.55)
plt.xticks(rotation=90)
plt.ylabel('Frecuencia')
plt.title('Distribución de tópicos en el corpus textual')
#plt.grid(b=None)
plt.show()
!pip install pyLDAvis
Requirement already satisfied: pyLDAvis in c:\users\suzak\anaconda3\lib\site-packages (3.3.1) Requirement already satisfied: pandas>=1.2.0 in c:\users\suzak\anaconda3\lib\site-packages (from pyLDAvis) (1.3.4) Requirement already satisfied: numexpr in c:\users\suzak\anaconda3\lib\site-packages (from pyLDAvis) (2.7.1) Requirement already satisfied: scipy in c:\users\suzak\anaconda3\lib\site-packages (from pyLDAvis) (1.5.2) Requirement already satisfied: future in c:\users\suzak\anaconda3\lib\site-packages (from pyLDAvis) (0.18.2) Requirement already satisfied: joblib in c:\users\suzak\anaconda3\lib\site-packages (from pyLDAvis) (0.17.0) Requirement already satisfied: setuptools in c:\users\suzak\anaconda3\lib\site-packages (from pyLDAvis) (50.3.1.post20201107) Requirement already satisfied: jinja2 in c:\users\suzak\anaconda3\lib\site-packages (from pyLDAvis) (2.11.2) Requirement already satisfied: numpy>=1.20.0 in c:\users\suzak\anaconda3\lib\site-packages (from pyLDAvis) (1.21.3) Requirement already satisfied: funcy in c:\users\suzak\anaconda3\lib\site-packages (from pyLDAvis) (1.16) Requirement already satisfied: sklearn in c:\users\suzak\anaconda3\lib\site-packages (from pyLDAvis) (0.0) Requirement already satisfied: gensim in c:\users\suzak\anaconda3\lib\site-packages (from pyLDAvis) (4.1.2) Requirement already satisfied: scikit-learn in c:\users\suzak\anaconda3\lib\site-packages (from pyLDAvis) (0.23.2) Requirement already satisfied: pytz>=2017.3 in c:\users\suzak\anaconda3\lib\site-packages (from pandas>=1.2.0->pyLDAvis) (2020.1) Requirement already satisfied: python-dateutil>=2.7.3 in c:\users\suzak\anaconda3\lib\site-packages (from pandas>=1.2.0->pyLDAvis) (2.8.1) Requirement already satisfied: MarkupSafe>=0.23 in c:\users\suzak\anaconda3\lib\site-packages (from jinja2->pyLDAvis) (1.1.1) Requirement already satisfied: smart-open>=1.8.1 in c:\users\suzak\anaconda3\lib\site-packages (from gensim->pyLDAvis) (5.2.1) Requirement already satisfied: Cython==0.29.23 in c:\users\suzak\anaconda3\lib\site-packages (from gensim->pyLDAvis) (0.29.23) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\suzak\anaconda3\lib\site-packages (from scikit-learn->pyLDAvis) (2.1.0) Requirement already satisfied: six>=1.5 in c:\users\suzak\anaconda3\lib\site-packages (from python-dateutil>=2.7.3->pandas>=1.2.0->pyLDAvis) (1.12.0)
len(df_dominant_topic.Text)
68
# distribucion de frecuencia de word counts
import seaborn as sns
doc_lens = [len(d) for d in df_dominant_topic.Text]
# Plot
'''
#plt.figure(figsize=(20,20),dpi=72)
plt.hist(doc_lens, bins = 25, color='navy')
plt.text(80, 50000, "Mean : " + str(round(np.mean(doc_lens))))
plt.text(80, 45000, "Median : " + str(round(np.median(doc_lens))))
plt.text(80, 40000, "Stdev : " + str(round(np.std(doc_lens))))
plt.text(80, 35000, "1 percentile : " + str(round(np.quantile(doc_lens, q=0.01))))
plt.text(80, 30000, "99 percentile : " + str(round(np.quantile(doc_lens, q=0.99))))
plt.xlim(0,125)
#plt.gca().set(xlim=(750, 875), ylabel='Number of Documents', xlabel='Document Word Count')
#plt.tick_params(size=16)
#plt.xticks(np.linspace(750,875,9))
#plt.title('Distribution of Document Word Counts', fontdict=dict(size=22))
plt.title('Distribucion de Conteos de palabras', fontdict=dict(size=16))
plt.ylabel('Número de registoros')
plt.xlabel('Conteo de palabras por documentos')
'''
fig=plt.figure(figsize=(10,10))
ax = fig.add_subplot(1,1,1) # create an axes object in the figure
ax.grid(b=None)
ax2=ax.twinx()
ax2.grid(b=None)
ax2.axes.get_yaxis().set_visible(False)
ax.hist(doc_lens, bins = 25, color='navy')
ax.tick_params(axis='y', labelcolor='black', color='black')
ax.text(10, 12, "Mean : " + str(round(np.mean(doc_lens))))
ax.text(10, 11, "Median : " + str(round(np.median(doc_lens))))
ax.text(10, 10, "Stdev : " + str(round(np.std(doc_lens))))
ax.text(10, 9, "1 percentile : " + str(round(np.quantile(doc_lens, q=0.01))))
ax.text(10, 8, "99 percentile : " + str(round(np.quantile(doc_lens, q=0.99))))
sns.kdeplot(doc_lens, color="black", shade=False,ax=ax2)
ax.set(xlim=(0, 120), xlabel='Conteo de palabras por documentos')
ax.set_ylabel('Número de registros')
ax.set_title('Distribución de Conteos de palabras')
ax.set_xlabel('Conteo de palabras por documentos')
plt.show()
# Cambiar el formato de Topico dominante a numerico otra vez
# Formato
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# Show
df_dominant_topic.head(10)
Document_No | Dominant_Topic | Topic_Perc_Contrib | Keywords | Text | |
---|---|---|---|---|---|
0 | 0 | 7.0 | 0.9894 | agriculture, water, iot, irrigation, system, b... | [agriculture, considered, one, major, sources,... |
1 | 1 | 3.0 | 0.7507 | irrigation, system, iot, water, data, manageme... | [automatic, irrigation, scheduling, systems, h... |
2 | 2 | 2.0 | 0.7769 | water, irrigation, system, iot, agriculture, d... | [iot, iot, plays, vital, role, entity, sharing... |
3 | 3 | 3.0 | 0.6097 | irrigation, system, iot, water, data, manageme... | [new, technologies, potential, transform, agri... |
4 | 4 | 1.0 | 0.9852 | water, system, irrigation, iot, systems, using... | [technological, innovations, affect, human, ac... |
5 | 5 | 9.0 | 0.9916 | iot, water, system, irrigation, farmers, data,... | [scarcity, desertification, considered, among,... |
6 | 6 | 5.0 | 0.9899 | system, irrigation, soil, agriculture, data, w... | [center, pivot, systems, widely, used, overcom... |
7 | 7 | 1.0 | 0.5525 | water, system, irrigation, iot, systems, using... | [agriculture, wireless, sensor, networks, lora... |
8 | 8 | 3.0 | 0.9899 | irrigation, system, iot, water, data, manageme... | [article, present, design, iot, dynamic, irrig... |
9 | 9 | 2.0 | 0.9920 | water, irrigation, system, iot, agriculture, d... | [various, advancements, made, numerous, domain... |
label_dicc1 = {0:1, 1:2, 2:3, 3:4, 4:5,\
5:6, 6:7, 7:8,8:9
}
df_dominant_topic['Dominant_Topic'] = df_dominant_topic['Dominant_Topic'].map(label_dicc1)
df_dominant_topic.head(10)
Document_No | Dominant_Topic | Topic_Perc_Contrib | Keywords | Text | |
---|---|---|---|---|---|
0 | 0 | 8.0 | 0.9894 | agriculture, water, iot, irrigation, system, b... | [agriculture, considered, one, major, sources,... |
1 | 1 | 4.0 | 0.7507 | irrigation, system, iot, water, data, manageme... | [automatic, irrigation, scheduling, systems, h... |
2 | 2 | 3.0 | 0.7769 | water, irrigation, system, iot, agriculture, d... | [iot, iot, plays, vital, role, entity, sharing... |
3 | 3 | 4.0 | 0.6097 | irrigation, system, iot, water, data, manageme... | [new, technologies, potential, transform, agri... |
4 | 4 | 2.0 | 0.9852 | water, system, irrigation, iot, systems, using... | [technological, innovations, affect, human, ac... |
5 | 5 | NaN | 0.9916 | iot, water, system, irrigation, farmers, data,... | [scarcity, desertification, considered, among,... |
6 | 6 | 6.0 | 0.9899 | system, irrigation, soil, agriculture, data, w... | [center, pivot, systems, widely, used, overcom... |
7 | 7 | 2.0 | 0.5525 | water, system, irrigation, iot, systems, using... | [agriculture, wireless, sensor, networks, lora... |
8 | 8 | 4.0 | 0.9899 | irrigation, system, iot, water, data, manageme... | [article, present, design, iot, dynamic, irrig... |
9 | 9 | 3.0 | 0.9920 | water, irrigation, system, iot, agriculture, d... | [various, advancements, made, numerous, domain... |
df_dominant_topic.Dominant_Topic.unique()
array([ 8., 4., 3., 2., nan, 6., 1., 9., 5., 7.])
import seaborn as sns
import matplotlib.colors as mcolors
import random
fig=plt.figure(figsize=(20,20))
for i in range(9):
#plt.subplot(4,4,i+1)
#fig = plt.figure()
ax = fig.add_subplot(4,4,i+1) # create an axes object in the figure
ax.grid(b=None)
ax2=ax.twinx()
ax2.grid(b=None)
ax2.axes.get_yaxis().set_visible(False) # Esconder el label del segundo eje de densidad
r = random.random()
b = random.random()
g = random.random()
color = (r, g, b)
df_dominant_topic_sub = df_dominant_topic.loc[df_dominant_topic.Dominant_Topic == i+1, :]
doc_lens = [len(d) for d in df_dominant_topic_sub.Text]
ax.hist(doc_lens, bins = 25,color=color)
ax.tick_params(axis='y', labelcolor=color, color=color)
sns.kdeplot(doc_lens, color="black", shade=False,ax=ax2)
ax.set(xlim=(0, 125), xlabel='Conteo de palabras por documentos')
ax.set_ylabel('Número de registros', color=color)
ax.set_xlabel('Conteo de palabras por documentos', color=color)
ax.set_title('Topic: '+str(i+1), fontdict=dict(size=17, color=color))
#ax.xlim(0,125)
#plt.title('Distribucion de Conteos de palabras', fontdict=dict(size=16))
#ax.ylabel('Frecuencia',color=color)
#ax.xlabel('Conteo de palabras por documentos',color=color)
#ax.set_ylabel('Number of Documents', color=cols[i])
#plt.title('Topic: '+str(i+1), fontdict=dict(size=17, color=color))
#plt.text(100, 15000, "Mean : " + str(round(np.mean(doc_lens))))
#plt.text(100, 14000, "Median : " + str(round(np.median(doc_lens))))
#plt.text(100, 13000, "Stdev : " + str(round(np.std(doc_lens))))
#plt.text(100, 12000, "1 percentile : " + str(round(np.quantile(doc_lens, q=0.01))))
#plt.text(100, 11000, "99 percentile : " + str(round(np.quantile(doc_lens, q=0.99))))
plt.tight_layout()
# Lectura - Database (Formato Excel)
import time
start_time = time.time()
import pandas as pd
df_dominant_topic=pd.read_csv('df_dominant_topic_2.csv',sep=',')
#/content/gdrive/My Drive/datatf_labels_5pca_6cluster.csv
#data.head()
print("--- %s seconds ---" % (time.time() - start_time))
df_dominant_topic.head() # Visualización de dataset
--- 0.004001140594482422 seconds ---
Unnamed: 0 | Document_No | Dominant_Topic | Topic_Perc_Contrib | Keywords | Text | |
---|---|---|---|---|---|---|
0 | 0 | 0 | Topic 8 | 0.9894 | agriculture, water, iot, irrigation, system, b... | ['agriculture', 'considered', 'one', 'major', ... |
1 | 1 | 1 | Topic 4 | 0.7507 | irrigation, system, iot, water, data, manageme... | ['automatic', 'irrigation', 'scheduling', 'sys... |
2 | 2 | 2 | Topic 3 | 0.7769 | water, irrigation, system, iot, agriculture, d... | ['iot', 'iot', 'plays', 'vital', 'role', 'enti... |
3 | 3 | 3 | Topic 4 | 0.6097 | irrigation, system, iot, water, data, manageme... | ['new', 'technologies', 'potential', 'transfor... |
4 | 4 | 4 | Topic 2 | 0.9852 | water, system, irrigation, iot, systems, using... | ['technological', 'innovations', 'affect', 'hu... |
label_dicc1 = {'Topic 1':1, 'Topic 2':2, 'Topic 3':3, 'Topic 4':4, 'Topic 5':5,\
'Topic 6':6, 'Topic 7':7, 'Topic 8':8, 'Topic 9':9
}
df_dominant_topic['Dominant_Topic'] = df_dominant_topic['Dominant_Topic'].map(label_dicc1)
df_dominant_topic.head(10)
Unnamed: 0 | Document_No | Dominant_Topic | Topic_Perc_Contrib | Keywords | Text | |
---|---|---|---|---|---|---|
0 | 0 | 0 | 8.0 | 0.9894 | agriculture, water, iot, irrigation, system, b... | ['agriculture', 'considered', 'one', 'major', ... |
1 | 1 | 1 | 4.0 | 0.7507 | irrigation, system, iot, water, data, manageme... | ['automatic', 'irrigation', 'scheduling', 'sys... |
2 | 2 | 2 | 3.0 | 0.7769 | water, irrigation, system, iot, agriculture, d... | ['iot', 'iot', 'plays', 'vital', 'role', 'enti... |
3 | 3 | 3 | 4.0 | 0.6097 | irrigation, system, iot, water, data, manageme... | ['new', 'technologies', 'potential', 'transfor... |
4 | 4 | 4 | 2.0 | 0.9852 | water, system, irrigation, iot, systems, using... | ['technological', 'innovations', 'affect', 'hu... |
5 | 5 | 5 | NaN | 0.9916 | iot, water, system, irrigation, farmers, data,... | ['scarcity', 'desertification', 'considered', ... |
6 | 6 | 6 | 6.0 | 0.9899 | system, irrigation, soil, agriculture, data, w... | ['center', 'pivot', 'systems', 'widely', 'used... |
7 | 7 | 7 | 2.0 | 0.5525 | water, system, irrigation, iot, systems, using... | ['agriculture', 'wireless', 'sensor', 'network... |
8 | 8 | 8 | 4.0 | 0.9899 | irrigation, system, iot, water, data, manageme... | ['article', 'present', 'design', 'iot', 'dynam... |
9 | 9 | 9 | 3.0 | 0.9920 | water, irrigation, system, iot, agriculture, d... | ['various', 'advancements', 'made', 'numerous'... |
df_dominant_topic['Text2'] = df_dominant_topic['Text'].apply(lambda x: x[1:-1])
df_dominant_topic['Text2'].head()
0 'agriculture', 'considered', 'one', 'major', '... 1 'automatic', 'irrigation', 'scheduling', 'syst... 2 'iot', 'iot', 'plays', 'vital', 'role', 'entit... 3 'new', 'technologies', 'potential', 'transform... 4 'technological', 'innovations', 'affect', 'hum... Name: Text2, dtype: object
def pre_process(text):
text=text.lower()
text = re.sub('\[.*?¿\]\%', ' ',text) #remover caracteres especiales
text = re.sub('[%s]' % re.escape(string.punctuation), ' ',text) #remover puntuación
text = re.sub('\w*\d\w*', '',text) #remover números entre palabras
text = re.sub(r'\b(\w+)( \1\b)+', r'\1', text) #remover palabras duplicadas consecutivas
text = re.sub(r',', '', text) # Remover comas
text = re.sub(r'\-', ' ', text) # Remover guiones
text = re.sub(r'á', 'a', text) # remover tildes
text = re.sub(r'é', 'e', text) # remover tildes
text = re.sub(r'í', 'i', text) # remover tildes
text = re.sub(r'ó', 'o', text) # remover tildes
text = re.sub(r'ú', 'u', text) # remover tildes
text = re.sub('\[.*?¿\]\%', ' ',text) #remover caracteres especiales
text = re.sub('\w*\d\w*', '',text) #remover números entre palabras
text = re.sub('[‘’“”…«»]', '',text)#remover caracteres adicionales
text = re.sub('\n', ' ',text) #remover newlines
return text
df_dominant_topic['Text2'] = df_dominant_topic['Text2'].apply(lambda x:pre_process(x))
df_dominant_topic['Text2'].head()
0 agriculture considered one major ... 1 automatic irrigation scheduling syst... 2 iot iot plays vital role entit... 3 new technologies potential transform... 4 technological innovations affect hum... Name: Text2, dtype: object
df_dominant_topic['Text2']=df_dominant_topic.Text2.str.replace(' ', ' ')
df_dominant_topic['Text2'].head()
0 agriculture considered one major sources main... 1 automatic irrigation scheduling systems highl... 2 iot iot plays vital role entity sharing minim... 3 new technologies potential transform agricult... 4 technological innovations affect human activi... Name: Text2, dtype: object
from sklearn import metrics
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
start_time = time.time()
def sort_coo(coo_matrix):
tuples = zip(coo_matrix.col, coo_matrix.data)
return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
"""Obtener los feature names y el tf-idf score de las top n palabras"""
#Para este caso se usa solo el top n de items para el vector
sorted_items = sorted_items[:topn]
score_vals = []
feature_vals = []
# Indice de palabras correspondiente al tf_idf score
for idx, score in sorted_items:
#Esto es para mantener el orden entre el feature y el score correspondiente
score_vals.append(round(score, 3))
feature_vals.append(feature_names[idx])
#Crear las tuplas de feature,score
#resultados = zip(feature_vals,score_vals)
results= {}
for idx in range(len(feature_vals)):
results[feature_vals[idx]]=score_vals[idx]
return results
for i in range(9):
locals()["df" + str(i+1)] = df_dominant_topic[df_dominant_topic['Dominant_Topic']==i+1]
locals()["docs" + str(i+1)]=locals()["df" + str(i+1)]['Text2'].tolist()
locals()["cv" + str(i+1)]=CountVectorizer(max_df=0.50,max_features=10000)
locals()["word_count_vector" + str(i+1)]=locals()["cv" + str(i+1)].fit_transform(locals()["docs" + str(i+1)])
locals()["tfidf_transformer" + str(i+1)]=TfidfTransformer(smooth_idf=True,use_idf=True)
locals()["tfidf_transformer" + str(i+1)].fit(locals()["word_count_vector" + str(i+1)])
locals()["feature_names" + str(i+1)]=locals()["cv" + str(i+1)].get_feature_names()
locals()["doc" + str(i+1)]=str(locals()["docs" + str(i+1)][0:273787])
locals()["tf_idf_vector" + str(i+1)]=locals()["tfidf_transformer" + str(i+1)].transform(locals()["cv" + str(i+1)].transform([locals()["doc" + str(i+1)]]))
locals()["sorted_items" + str(i+1)]=sort_coo(locals()["tf_idf_vector" + str(i+1)].tocoo())
locals()["keywords" + str(i+1)]=extract_topn_from_vector(locals()["feature_names" + str(i+1)],locals()["sorted_items" + str(i+1)],100)
locals()["tuplex" + str(i+1)]=locals()["keywords" + str(i+1)].items()
locals()["tuple_listax" + str(i+1)]=list(locals()["tuplex" + str(i+1)])
locals()["tuple_listax" + str(i+1)]=sorted(locals()["tuple_listax" + str(i+1)], key = lambda x: x[1], reverse=True)
print("--- Tiempo de ejecucion fue de : %s seconds ---" % (time.time() - start_time))
--- Tiempo de ejecucion fue de : 0.03602886199951172 seconds ---
# Crear el histograma perspectiva 1 (palabras simples)
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 10})
plt.figure(figsize=(27,32))
for i in range(9):
plt.subplot(3,4,i+1)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(i+1)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(i+1)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.ylabel('TF-IDF score')
plt.title('Histograma de palabras TF-IDF Topic: '+str(i+1))
#plt.ylim(0,0.30)
# Crear el histograma perspectiva 1 (palabras simples)
from wordcloud import WordCloud
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 10})
#j = np.ceil(11/4)
#max_font_size=256, max_words=100, background_color="white",width=3000,height=1500
plt.figure(figsize=(12,12))
for i in range(9):
#plt.subplot(j, 4, i+1).set_title("Topic #" + str(i+1))
plt.subplot(3,4,i+1)
wordcloud = WordCloud(max_words=100, background_color="white",width = 3000, height = 2000, random_state=1).generate_from_frequencies(locals()["keywords" + str(i+1)])
plt.imshow(wordcloud,interpolation="bilinear")
plt.title('Wordcloud Topic: '+str(i+1))
plt.axis("off")
#plt.show()
from wordcloud import WordCloud
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 10})
#j = np.ceil(11/4)
#max_font_size=256, max_words=100, background_color="white",width=3000,height=1500
plt.figure(figsize=(35,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(1)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(1)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Histograma de palabras TF-IDF Topic: '+str(1))
plt.subplot(342)
wordcloud = WordCloud(max_words=100, background_color="white").generate_from_frequencies(locals()["keywords" + str(1)])
plt.imshow(wordcloud,interpolation="bilinear")
plt.title('Wordcloud Topic: '+str(1))
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
plt.figure(figsize=(35,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(2)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(2)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Histograma de palabras TF-IDF Topic: '+str(2))
plt.subplot(342)
wordcloud = WordCloud(max_words=100, background_color="white").generate_from_frequencies(locals()["keywords" + str(2)])
plt.imshow(wordcloud,interpolation="bilinear")
plt.title('Wordcloud Topic: '+str(2))
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
plt.figure(figsize=(35,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(3)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(3)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Histograma de palabras TF-IDF Topic: '+str(3))
plt.subplot(342)
wordcloud = WordCloud(max_words=100, background_color="white").generate_from_frequencies(locals()["keywords" + str(3)])
plt.imshow(wordcloud,interpolation="bilinear")
plt.title('Wordcloud Topic: '+str(3))
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
plt.figure(figsize=(35,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(4)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(4)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Histograma de palabras TF-IDF Topic: '+str(4))
plt.subplot(342)
wordcloud = WordCloud(max_words=100, background_color="white").generate_from_frequencies(locals()["keywords" + str(4)])
plt.imshow(wordcloud,interpolation="bilinear")
plt.title('Wordcloud Topic: '+str(4))
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
plt.figure(figsize=(35,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(5)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(5)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Histograma de palabras TF-IDF Topic: '+str(5))
plt.subplot(342)
wordcloud = WordCloud(max_words=100, background_color="white").generate_from_frequencies(locals()["keywords" + str(5)])
plt.imshow(wordcloud,interpolation="bilinear")
plt.title('Wordcloud Topic: '+str(5))
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
plt.figure(figsize=(35,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(6)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(6)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Histograma de palabras TF-IDF Topic: '+str(6))
plt.subplot(342)
wordcloud = WordCloud(max_words=100, background_color="white").generate_from_frequencies(locals()["keywords" + str(6)])
plt.imshow(wordcloud,interpolation="bilinear")
plt.title('Wordcloud Topic: '+str(6))
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
plt.figure(figsize=(35,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(7)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(7)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Histograma de palabras TF-IDF Topic: '+str(7))
plt.subplot(342)
wordcloud = WordCloud(max_words=100, background_color="white").generate_from_frequencies(locals()["keywords" + str(7)])
plt.imshow(wordcloud,interpolation="bilinear")
plt.title('Wordcloud Topic: '+str(7))
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
plt.figure(figsize=(35,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(8)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(8)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Histograma de palabras TF-IDF Topic: '+str(8))
plt.subplot(342)
wordcloud = WordCloud(max_words=100, background_color="white").generate_from_frequencies(locals()["keywords" + str(8)])
plt.imshow(wordcloud,interpolation="bilinear")
plt.title('Wordcloud Topic: '+str(8))
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
plt.figure(figsize=(35,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(9)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(9)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Histograma de palabras TF-IDF Topic: '+str(9))
plt.subplot(342)
wordcloud = WordCloud(max_words=100, background_color="white").generate_from_frequencies(locals()["keywords" + str(9)])
plt.imshow(wordcloud,interpolation="bilinear")
plt.title('Wordcloud Topic: '+str(9))
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 10})
plt.figure(figsize=(27,32))
for i in range(9): #Hasta la cantidad de tópicos
plt.subplot(3,4,i+1)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(i+1)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(i+1)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.ylabel('TF-IDF score')
plt.title('Histograma de palabras TF-IDF Topic: '+str(i+1))
#plt.ylim(0,0.30)
print(df1.shape)
print(df2.shape)
print(df3.shape)
print(df4.shape)
print(df5.shape)
print(df6.shape)
print(df7.shape)
print(df8.shape)
print(df9.shape)
(8, 7) (5, 7) (10, 7) (18, 7) (2, 7) (7, 7) (6, 7) (2, 7) (3, 7)
from sklearn import metrics
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
start_time = time.time()
def sort_coo(coo_matrix):
tuples = zip(coo_matrix.col, coo_matrix.data)
return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
"""Obtener los feature names y el tf-idf score de las top n palabras"""
#Para este caso se usa solo el top n de items para el vector
sorted_items = sorted_items[:topn]
score_vals = []
feature_vals = []
# Indice de palabras correspondiente al tf_idf score
for idx, score in sorted_items:
#Esto es para mantener el orden entre el feature y el score correspondiente
score_vals.append(round(score, 3))
feature_vals.append(feature_names[idx])
#Crear las tuplas de feature,score
#resultados = zip(feature_vals,score_vals)
results= {}
for idx in range(len(feature_vals)):
results[feature_vals[idx]]=score_vals[idx]
return results
for i in range(9): #Para 9 tópicos
locals()["df" + str(i+1)] = df_dominant_topic[df_dominant_topic['Dominant_Topic']==i+1]
locals()["docs" + str(i+1)]=locals()["df" + str(i+1)]['Text2'].tolist()
locals()["cv" + str(i+1)]=CountVectorizer(max_df=0.50,max_features=10000,ngram_range = (2,2))
locals()["word_count_vector" + str(i+1)]=locals()["cv" + str(i+1)].fit_transform(locals()["docs" + str(i+1)])
locals()["tfidf_transformer" + str(i+1)]=TfidfTransformer(smooth_idf=True,use_idf=True)
locals()["tfidf_transformer" + str(i+1)].fit(locals()["word_count_vector" + str(i+1)])
locals()["feature_names" + str(i+1)]=locals()["cv" + str(i+1)].get_feature_names()
locals()["doc" + str(i+1)]=str(locals()["docs" + str(i+1)][0:273787])
locals()["tf_idf_vector" + str(i+1)]=locals()["tfidf_transformer" + str(i+1)].transform(locals()["cv" + str(i+1)].transform([locals()["doc" + str(i+1)]]))
locals()["sorted_items" + str(i+1)]=sort_coo(locals()["tf_idf_vector" + str(i+1)].tocoo())
locals()["keywords" + str(i+1)]=extract_topn_from_vector(locals()["feature_names" + str(i+1)],locals()["sorted_items" + str(i+1)],100)
locals()["tuplex" + str(i+1)]=locals()["keywords" + str(i+1)].items()
locals()["tuple_listax" + str(i+1)]=list(locals()["tuplex" + str(i+1)])
locals()["tuple_listax" + str(i+1)]=sorted(locals()["tuple_listax" + str(i+1)], key = lambda x: x[1], reverse=True)
print("--- Tiempo de ejecucion fue de : %s seconds ---" % (time.time() - start_time))
--- Tiempo de ejecucion fue de : 0.06299901008605957 seconds ---
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(1)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(1)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Bigrams de palabras TF-IDF Topic: '+str(1))
Text(0.5, 1.0, 'Bigrams de palabras TF-IDF Topic: 1')
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(2)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(2)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Bigrams de palabras TF-IDF Topic: '+str(2))
Text(0.5, 1.0, 'Bigrams de palabras TF-IDF Topic: 2')
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(3)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(3)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Bigrams de palabras TF-IDF Topic: '+str(3))
Text(0.5, 1.0, 'Bigrams de palabras TF-IDF Topic: 3')
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(4)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(4)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Bigrams de palabras TF-IDF Topic: '+str(4))
Text(0.5, 1.0, 'Bigrams de palabras TF-IDF Topic: 4')
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(5)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(5)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Bigrams de palabras TF-IDF Topic: '+str(5))
Text(0.5, 1.0, 'Bigrams de palabras TF-IDF Topic: 5')
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(6)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(6)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Bigrams de palabras TF-IDF Topic: '+str(6))
Text(0.5, 1.0, 'Bigrams de palabras TF-IDF Topic: 6')
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(7)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(7)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Bigrams de palabras TF-IDF Topic: '+str(7))
Text(0.5, 1.0, 'Bigrams de palabras TF-IDF Topic: 7')
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(8)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(8)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Bigrams de palabras TF-IDF Topic: '+str(8))
Text(0.5, 1.0, 'Bigrams de palabras TF-IDF Topic: 8')
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(9)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(9)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Bigrams de palabras TF-IDF Topic: '+str(9))
Text(0.5, 1.0, 'Bigrams de palabras TF-IDF Topic: 9')
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 11})
plt.figure(figsize=(50,50))
for i in range(9):
plt.subplot(3,4,i+1)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(i+1)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(i+1)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.ylabel('TF-IDF score')
plt.title('Histograma Bigrams TF-IDF Topic: '+str(i+1))
from sklearn import metrics
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
start_time = time.time()
def sort_coo(coo_matrix):
tuples = zip(coo_matrix.col, coo_matrix.data)
return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
"""Obtener los feature names y el tf-idf score de las top n palabras"""
#Para este caso se usa solo el top n de items para el vector
sorted_items = sorted_items[:topn]
score_vals = []
feature_vals = []
# Indice de palabras correspondiente al tf_idf score
for idx, score in sorted_items:
#Esto es para mantener el orden entre el feature y el score correspondiente
score_vals.append(round(score, 3))
feature_vals.append(feature_names[idx])
#Crear las tuplas de feature,score
#resultados = zip(feature_vals,score_vals)
results= {}
for idx in range(len(feature_vals)):
results[feature_vals[idx]]=score_vals[idx]
return results
for i in range(9):
locals()["df" + str(i+1)] = df_dominant_topic[df_dominant_topic['Dominant_Topic']==i+1]
locals()["docs" + str(i+1)]=locals()["df" + str(i+1)]['Text2'].tolist()
locals()["cv" + str(i+1)]=CountVectorizer(max_df=0.50,max_features=10000,ngram_range = (3,3))
locals()["word_count_vector" + str(i+1)]=locals()["cv" + str(i+1)].fit_transform(locals()["docs" + str(i+1)])
locals()["tfidf_transformer" + str(i+1)]=TfidfTransformer(smooth_idf=True,use_idf=True)
locals()["tfidf_transformer" + str(i+1)].fit(locals()["word_count_vector" + str(i+1)])
locals()["feature_names" + str(i+1)]=locals()["cv" + str(i+1)].get_feature_names()
locals()["doc" + str(i+1)]=str(locals()["docs" + str(i+1)][0:273787])
locals()["tf_idf_vector" + str(i+1)]=locals()["tfidf_transformer" + str(i+1)].transform(locals()["cv" + str(i+1)].transform([locals()["doc" + str(i+1)]]))
locals()["sorted_items" + str(i+1)]=sort_coo(locals()["tf_idf_vector" + str(i+1)].tocoo())
locals()["keywords" + str(i+1)]=extract_topn_from_vector(locals()["feature_names" + str(i+1)],locals()["sorted_items" + str(i+1)],100)
locals()["tuplex" + str(i+1)]=locals()["keywords" + str(i+1)].items()
locals()["tuple_listax" + str(i+1)]=list(locals()["tuplex" + str(i+1)])
locals()["tuple_listax" + str(i+1)]=sorted(locals()["tuple_listax" + str(i+1)], key = lambda x: x[1], reverse=True)
print("--- Tiempo de ejecucion fue de : %s seconds ---" % (time.time() - start_time))
--- Tiempo de ejecucion fue de : 0.04495716094970703 seconds ---
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(1)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(1)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Trigrams de palabras TF-IDF Topic: '+str(1))
Text(0.5, 1.0, 'Trigrams de palabras TF-IDF Topic: 1')
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(2)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(2)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Trigrams de palabras TF-IDF Topic: '+str(2))
Text(0.5, 1.0, 'Trigrams de palabras TF-IDF Topic: 2')
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(3)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(3)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Trigrams de palabras TF-IDF Topic: '+str(3))
Text(0.5, 1.0, 'Trigrams de palabras TF-IDF Topic: 3')
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(4)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(4)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Trigrams de palabras TF-IDF Topic: '+str(4))
Text(0.5, 1.0, 'Trigrams de palabras TF-IDF Topic: 4')
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(5)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(5)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Trigrams de palabras TF-IDF Topic: '+str(5))
Text(0.5, 1.0, 'Trigrams de palabras TF-IDF Topic: 5')
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(6)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(6)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Trigrams de palabras TF-IDF Topic: '+str(6))
Text(0.5, 1.0, 'Trigrams de palabras TF-IDF Topic: 6')
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(7)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(7)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Trigrams de palabras TF-IDF Topic: '+str(7))
Text(0.5, 1.0, 'Trigrams de palabras TF-IDF Topic: 7')
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(8)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(8)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Trigrams de palabras TF-IDF Topic: '+str(8))
Text(0.5, 1.0, 'Trigrams de palabras TF-IDF Topic: 8')
plt.figure(figsize=(45,25))
plt.subplot(341)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(9)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(9)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.grid(b=None)
plt.ylabel('TF-IDF score')
plt.title('Trigrams de palabras TF-IDF Topic: '+str(9))
Text(0.5, 1.0, 'Trigrams de palabras TF-IDF Topic: 9')
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 28})
plt.figure(figsize=(120,150))
for i in range(9):
plt.subplot(3,4,i+1)
palabras = [a_tuple[0] for a_tuple in locals()["tuple_listax" + str(i+1)]]
frecuencias = [a_tuple[1] for a_tuple in locals()["tuple_listax" + str(i+1)]]
plt.bar(palabras[0:30],frecuencias[0:30])
plt.xticks(rotation=90)
plt.ylabel('TF-IDF score')
plt.title('Histograma Trigrams TF-IDF Topic: '+str(i+1))
lista_r = data['Abstract_sin_stopwords2']
len(conversionx)
68
import time
start_time = time.time()
# Packages to use
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import collections
import nltk
from nltk import bigrams
from nltk.corpus import stopwords
import re
import networkx as nx
import warnings
warnings.filterwarnings("ignore")
#Crear una lista con los elementos bigrams
terms_bigram = [list(bigrams(aj)) for aj in lista_r]
print("---El tiempo para obtener los bigrams es de: :%s seconds ---" % (time.time() - start_time))
---El tiempo para obtener los bigrams es de: :0.007999658584594727 seconds ---
#Hacer el merge de los bigrams
bigrams = list(itertools.chain(*terms_bigram))
#Contar cada bigram
bigram_conteos = collections.Counter(bigrams)
# Verifiquemos algunos de los bigrams mas importantes
bigram_conteos.most_common(38)
[(('soil', 'moisture'), 34), (('iot', 'iot'), 25), (('irrigation', 'management'), 22), (('irrigation', 'system'), 20), (('proposed', 'system'), 15), (('wireless', 'sensor'), 15), (('smart', 'irrigation'), 14), (('irrigation', 'systems'), 13), (('water', 'level'), 12), (('decision', 'support'), 11), (('rural', 'areas'), 11), (('iot', 'based'), 10), (('water', 'irrigation'), 10), (('automatic', 'irrigation'), 9), (('water', 'resources'), 9), (('using', 'iot'), 9), (('oil', 'palm'), 9), (('sensor', 'networks'), 8), (('water', 'management'), 8), (('agricultural', 'production'), 8), (('precision', 'agriculture'), 8), (('amount', 'water'), 8), (('weather', 'conditions'), 7), (('sensor', 'network'), 7), (('moisture', 'level'), 7), (('paper', 'presents'), 7), (('irrigation', 'scheduling'), 6), (('irrigation', 'decision'), 6), (('arethou5a', 'project'), 6), (('real', 'time'), 6), (('temperature', 'humidity'), 6), (('irrigation', 'process'), 6), (('management', 'system'), 6), (('mobile', 'application'), 6), (('iot', 'model'), 5), (('water', 'consumption'), 5), (('operations', 'management'), 5), (('water', 'usage'), 5)]
# Crear un datafram con los bigrams creados
bigram_data = pd.DataFrame(bigram_conteos.most_common(150),columns=['bigram', 'conteos'])
#bigram_data=bigram_data[0:50]
bigram_data.shape
(150, 2)
#Crear los conteos en un diccionario para cada bigrama
d = bigram_data.set_index('bigram').T.to_dict('records')
# Crear el elemento para la red
G = nx.Graph()
# Crear conexiones ntre nodos
for k, v in d[0].items():
G.add_edge(k[0], k[1], weight=(v * 5))
# Crear los objetos para el plot
fig, ax = plt.subplots(figsize=(20, 20))
pos = nx.spring_layout(G, k=2) # k=2 representa la Optimal distance entre nodos
# Crear el plot
nx.draw_networkx(G, pos,font_size=10,width=1,edge_color='black',node_color='yellow',with_labels = True,ax=ax)
#edge_color='black',node_color='blue',with_labels = False,ax=ax
# Ahora definimos los labels
for david, david1 in pos.items():
amir, amir1 = david1[0]+.135, david1[1]+.045
ax.text(amir, amir1,s=david,bbox=dict(facecolor='purple', alpha=0.10),horizontalalignment='center', fontsize=12)