utils.py
import nltk
import re # library for regular expression operations
import string # for string operations
from nltk.corpus import stopwords # module for stop words that come with NLTK
from nltk.stem import PorterStemmer # module for stemming
from nltk.tokenize import TweetTokenizer # module for tokenizing strings


def process_tweet(tweet):
    """
    Take in a tweet and process it by:
    - Removing retweet tags, hyperlinks, and hash signs
    - Tokenizing
    - Removing stopwords and punctuation
    - Stemming
    Returns the processed tweet as a single space-separated string.
    """
    # Download the stopwords from NLTK (a no-op if already downloaded)
    nltk.download('stopwords', quiet=True)
    # Remove old-style retweet text "RT"
    tweet2 = re.sub(r'^RT[\s]+', '', tweet)
    # Remove hyperlinks
    tweet2 = re.sub(r'https?://.*[\r\n]*', '', tweet2)
    # Remove the hash # sign, keeping the word itself
    tweet2 = re.sub(r'#', '', tweet2)
    # Instantiate the tokenizer class
    tokenizer = TweetTokenizer(preserve_case=False,
                               strip_handles=True,
                               reduce_len=True)
    # Tokenize the tweet
    tweet_tokens = tokenizer.tokenize(tweet2)
    # Load the English stopword list from NLTK
    stopwords_english = stopwords.words('english')
    tweets_clean = []
    for word in tweet_tokens:  # go through every word in the token list
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            tweets_clean.append(word)
    # Instantiate the stemming class
    stemmer = PorterStemmer()
    # Create an empty list to store the stems
    tweets_stem = []
    for word in tweets_clean:
        stem_word = stemmer.stem(word)  # stem the word
        tweets_stem.append(stem_word)  # append it to the list
    return ' '.join(tweets_stem)
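

# Example usage: a minimal sketch showing the full pipeline on one tweet.
# The sample tweet and the "roughly" expected output below are illustrative
# additions, not part of the original module.
if __name__ == "__main__":
    sample = "RT @nlp_fan: Loving the new #NLP course! :) https://example.com"
    print(process_tweet(sample))
    # Expected output (roughly): love new nlp cours :)
    # "RT" and the URL are stripped, "@nlp_fan" is removed by strip_handles,
    # "the" is dropped as a stopword, "!" as punctuation, and the remaining
    # words are lowercased and stemmed; the ":)" emoticon survives because
    # TweetTokenizer keeps it as one token that is not in string.punctuation.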