import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def preprocess_text(text):
    """Clean and tokenize text, remove stopwords."""
    if not isinstance(text, str):
        return []
    # Convert to lowercase and replace non-alphabetic characters with spaces
    text = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords: NLTK's English list plus domain-specific terms
    stop_words = set(stopwords.words('english'))
    medical_stopwords = {'the', 'and', 'of', 'to', 'is', 'in', 'doctor', 'patient'}
    stop_words.update(medical_stopwords)
    # Keep non-stopword tokens longer than two characters
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    return tokens
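Note that word_tokenize and stopwords.words depend on NLTK data packages that must be downloaded once ('punkt' for the tokenizer, or 'punkt_tab' on newer NLTK releases, and 'stopwords' for the stopword list). A minimal sketch of that one-time setup plus a call on a made-up sentence (the example text is purely illustrative):

import nltk

nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # English stopword corpus

print(preprocess_text("The patient, aged 45, reported chest pain to the doctor."))
# -> ['aged', 'reported', 'chest', 'pain']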
2. Extract Common Terms
from collections import Counter


def get_most_common_terms(texts, n=15):
    """Get the n most common terms across a list of texts."""
    all_tokens = []
    for text in texts:
        # Reuse the preprocessing step so counts reflect cleaned tokens
        tokens = preprocess_text(text)
        all_tokens.extend(tokens)
    return Counter(all_tokens).most_common(n)
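For reference, a quick sketch of how the two functions fit together; the input texts here are invented for illustration:

texts = [
    "Patient reported severe chest pain and shortness of breath.",
    "Chest pain subsided after treatment; breathing normal.",
]
print(get_most_common_terms(texts, n=5))
# e.g. [('chest', 2), ('pain', 2), ('reported', 1), ('severe', 1), ('shortness', 1)]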