| #!/usr/bin/env python
|
| # coding: utf-8
|
|
|
| # In[12]:
|
|
|
|
|
| import nltk
|
| from nltk.tokenize import word_tokenize
|
| from nltk.corpus import stopwords
|
| from nltk.stem import WordNetLemmatizer, PorterStemmer
|
| from nltk.probability import FreqDist
|
| from nltk.tokenize import sent_tokenize
|
| import string
|
|
|
# First-time setup: uncomment these to download the NLTK data that
# tokenisation, stopword removal, and lemmatisation require.
#nltk.download('punkt')

#nltk.download('stopwords')

#nltk.download('wordnet')
|
|
|
# Fixed sample text for the NLP preprocessing demonstration below.
text = "Natural Language Processing (NLP) is a subfield of artificial intelligence and linguistics. It involves the interaction between computers and humans in natural language. NLP focuses on the interaction between computers and humans, as well as the processing and analysis of large amounts of natural language data."


def remove_punctuation(tokens):
    """Strip punctuation characters from each token; drop tokens left empty.

    BUG FIX: the original comprehension used the translated token only as a
    filter predicate and then kept the *untranslated* token, so punctuation
    embedded inside a token (e.g. hyphens) was never actually removed.
    Here the cleaned token itself is returned.

    :param tokens: iterable of token strings
    :return: list of tokens with all ``string.punctuation`` chars removed,
        excluding tokens that become empty/whitespace-only
    """
    table = str.maketrans('', '', string.punctuation)
    cleaned = (token.translate(table).strip() for token in tokens)
    return [token for token in cleaned if token]


def remove_stopwords(tokens):
    """Drop English stopwords from *tokens* (comparison is case-insensitive).

    :param tokens: iterable of token strings
    :return: list of tokens whose lowercase form is not an NLTK stopword
    """
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token.lower() not in stop_words]


def main():
    """Run the tokenise -> clean -> stem/lemmatise -> frequency pipeline and print each stage."""
    print("The input text is: \n" + text)

    tokens = word_tokenize(text)
    print("\nThe tokens generated after tokenisation are: \n")
    print(tokens)

    stripped_tokens = remove_punctuation(tokens)
    print("\nThe tokens generated after removing punctuation are: \n")
    print(stripped_tokens)

    filtered_tokens = remove_stopwords(stripped_tokens)
    print("\nThe tokens after removing stopwords are: \n")
    print(filtered_tokens)

    # Stemming and lemmatisation are applied independently to the same
    # filtered tokens so their outputs can be compared side by side.
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    print("\nThe tokens after stemming are: \n")
    print(stemmed_tokens)

    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    print("\nThe tokens after lemmatization are: \n")
    print(lemmatized_tokens)

    # Frequency counts are taken over the lemmatised tokens, matching the
    # original script's choice of lemmas (not stems) for the distribution.
    freq_dist = FreqDist(lemmatized_tokens)
    print("\nFrequency Distribution: \n")
    print(freq_dist.most_common(5))

    # Sentence segmentation runs on the raw text, not the token stream.
    sentences = sent_tokenize(text)
    print("\nSentences: \n")
    for sentence in sentences:
        print(sentence)


if __name__ == "__main__":
    main()
|
|
|
|
|
| # In[ ]:
|