Un snippet intéressant trouvé sur Stack Overflow, qui prend notamment en charge les émojis :
import re

import emoji
import spacy
import stop_words
# English spaCy pipeline; sanitize() only uses it to split text into tokens.
nlp = spacy.load('en_core_web_sm')

# Lower-cased English stop words, filtered out of the text by sanitize().
stopwords = [
    word.lower()
    for word in stop_words.get_stop_words('en')
]

# Textual emoticons such as ":-)" or "(-:", in either reading direction.
# The pattern is laid out in verbose style (whitespace and "#" comments inside
# the pattern itself), so it has to be compiled with re.VERBOSE to match as
# intended.
emoticon_string = r"""
(?:
[<>]?
[:;=8] # eyes
[\-o\*\']? # optional nose
[\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
|
[\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
[\-o\*\']? # optional nose
[:;=8] # eyes
[<>]?
)"""
def give_emoji_free_text(text):
    """Return *text* with every graphical (Unicode) emoji removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which each emoji is replaced by the empty
        string.
    """
    # emoji.get_emoji_regexp() was deprecated in emoji 1.7 and removed in 2.0;
    # replace_emoji() is its supported replacement. Fall back to the old call
    # so installations pinned to an older release keep working.
    replace_emoji = getattr(emoji, 'replace_emoji', None)
    if replace_emoji is not None:
        return replace_emoji(text, replace='')
    return emoji.get_emoji_regexp().sub(r'', text)
def sanitize(string):
    """Sanitize one string.

    Steps, in order: remove graphical emoji, remove textual emoticons,
    lowercase, tokenize with spaCy and re-join the tokens with single spaces,
    remove ``@user`` mentions, remove the characters ``" : ! @ #``, remove
    ``t.co`` links, and remove English stop words.

    Args:
        string: The raw text to clean.

    Returns:
        The cleaned, lower-cased text with words separated by single spaces,
        or ``''`` when the tokenizer yields no tokens.
    """
    # remove graphical emoji
    string = give_emoji_free_text(string)

    # Remove textual emoticons. emoticon_string is written in verbose layout
    # (whitespace and inline "#" comments), so without re.VERBOSE it can never
    # match anything. The lookarounds keep it from firing in the middle of
    # ordinary text such as "said:", "8:30" or the "://" of a URL.
    # Trade-off: an emoticon glued directly to a word ("hello:)") is left
    # in place.
    emoticon_re = re.compile(
        r'(?<!\w)(?:' + emoticon_string + '\n' + r')(?!\w)',
        re.VERBOSE,
    )
    string = emoticon_re.sub('', string)

    # normalize to lowercase
    string = string.lower()

    # spaCy tokenizer
    tokens = [token.text for token in nlp(string)]
    # in case the string is empty
    if not tokens:
        return ''
    # join back to string
    string = ' '.join(tokens)

    # remove user mentions, assuming the user name has @ in front
    string = re.sub(r'@\w+', '', string)

    # remove leftover punctuation in a single pass
    string = string.translate(str.maketrans('', '', '":!@#'))

    # Remove 't.co/' links. The ':' of the scheme has already been stripped
    # by the punctuation step above, hence "http//" rather than "http://".
    # The dot is escaped and "https" is accepted as well as "http".
    string = re.sub(r'https?//t\.co/\S+', '', string)

    # removing stop words (set lookup instead of scanning the list per word)
    stop_set = set(stopwords)
    return ' '.join(w for w in string.split() if w not in stop_set)