How do I do preprocessing steps like stopword removal, punctuation removal, stemming, and lemmatization in spaCy using Python?
I have text data (paragraphs and sentences) in a CSV file and I want to do text cleaning.
Kindly give an example that loads the CSV into a pandas DataFrame.
This may help anyone who is looking for an answer to this question.
import spacy
import pandas as pd
from nltk.corpus import stopwords  # requires nltk.download('stopwords') once

# Load the English model; keep the tagger enabled so lemmatization works,
# and disable the components that are not needed here.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
stops = stopwords.words("english")

def normalize(comment, lowercase, remove_stopwords):
    if lowercase:
        comment = comment.lower()
    comment = nlp(comment)
    lemmatized = list()
    for word in comment:
        lemma = word.lemma_.strip()
        if lemma:
            if not remove_stopwords or (remove_stopwords and lemma not in stops):
                lemmatized.append(lemma)
    return " ".join(lemmatized)

# Load the CSV into a pandas DataFrame; the file name and the 'Text'
# column are placeholders for your own data.
Data = pd.read_csv("your_file.csv")
Data['Text_After_Clean'] = Data['Text'].apply(normalize, lowercase=True, remove_stopwords=True)
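The question also asks about punctuation removal and stemming. spaCy does not ship a stemmer (lemmatization is its built-in alternative; NLTK's PorterStemmer can be used if you really need stemming), but punctuation and stop words can be dropped with the token attributes is_punct, is_space, and is_stop. Here is a minimal sketch, assuming the same nlp object and DataFrame as above (the clean function name is just illustrative):

def clean(comment, lowercase=True, remove_stopwords=True):
    if lowercase:
        comment = comment.lower()
    doc = nlp(comment)
    tokens = []
    for token in doc:
        # drop punctuation and whitespace tokens
        if token.is_punct or token.is_space:
            continue
        # drop stop words using spaCy's own stop-word list
        if remove_stopwords and token.is_stop:
            continue
        tokens.append(token.lemma_)
    return " ".join(tokens)

Data['Text_After_Clean'] = Data['Text'].apply(clean)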