Complete Medical Dataset Analysis Notebook

This page contains the complete, runnable code for analyzing medical datasets. Copy and use these examples to perform comprehensive analysis on your own medical datasets.

🚀 Quick Start

1

Install Dependencies

# Install all required libraries at once
pip install pandas numpy matplotlib seaborn nltk wordcloud pillow scikit-learn
2

Download NLTK Data

import nltk
nltk.download('punkt')
nltk.download('stopwords')
3

Load Your Data

# Replace with your actual file paths
general_df = load_dataset('general-medical-instruction-dataset.json')
evaluation_df = load_dataset('evaluation-medical-instruction-dataset.json')
genmedgpt_df = load_dataset('GenMedGPT-5k.json')

📚 Complete Implementation

Data Loading and Setup

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import re
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Set plotting style
plt.style.use('ggplot')
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = [12, 8]

# Load datasets
def load_dataset(file_path):
    """Load JSON dataset and convert to DataFrame"""
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return pd.DataFrame(data)

# Replace with your actual file paths
try:
    general_df = load_dataset('general-medical-instruction-dataset.json')
    evaluation_df = load_dataset('evaluation-medical-instruction-dataset.json')
    genmedgpt_df = load_dataset('GenMedGPT-5k.json')
    
    print(f"General Medical Dataset: {len(general_df)} entries")
    print(f"Evaluation Medical Dataset: {len(evaluation_df)} entries")
    print(f"GenMedGPT-5k Dataset: {len(genmedgpt_df)} entries")
    
except Exception as e:
    print(f"Error loading datasets: {e}")
    print("Please ensure your file paths are correct and files exist.")

Text Preprocessing Functions

def preprocess_text(text):
    """Clean and tokenize text, remove stopwords and non-alphabetic tokens"""
    if not isinstance(text, str):
        return []
    
    # Convert to lowercase and remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    medical_stopwords = {
        'the', 'and', 'of', 'to', 'is', 'in', 'that', 'for', 'it', 'with', 
        'as', 'are', 'on', 'this', 'by', 'be', 'from', 'an', 'or', 'at', 
        'not', 'you', 'your', 'doctor', 'patient', 'medicine'
    }
    stop_words.update(medical_stopwords)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    
    return tokens

def count_words(texts):
    """Count words in a list of texts"""
    return [len(text.split()) if isinstance(text, str) else 0 for text in texts]

def get_most_common_terms(texts, n=15):
    """Get the most common terms in a list of texts"""
    all_tokens = []
    for text in texts:
        tokens = preprocess_text(text)
        all_tokens.extend(tokens)
    return Counter(all_tokens).most_common(n)

def identify_medical_conditions(text):
    """Identify medical conditions in text"""
    medical_conditions = [
        'syndrome', 'disease', 'carcinoma', 'infection', 'tumor', 'virus', 
        'blood', 'kidney', 'heart', 'acute', 'cancer', 'chronic', 'liver', 
        'pain', 'anxiety', 'depression', 'diabetes', 'hypertension', 'anemia',
        'asthma', 'allergy', 'arthritis', 'pneumonia', 'stroke', 'thyroid'
    ]
    
    if not isinstance(text, str):
        return []
    
    text_lower = text.lower()
    return [condition for condition in medical_conditions if condition in text_lower]

Dataset Statistics Analysis

# Create datasets dictionary for easy iteration
datasets = {
    'General Medical': general_df,
    'Evaluation Medical': evaluation_df,
    'GenMedGPT-5k': genmedgpt_df
}

# Calculate text lengths for each dataset
print("=== DATASET STATISTICS ===\n")
for name, df in datasets.items():
    df['instruction_length'] = count_words(df['instruction'])
    df['input_length'] = count_words(df['input'])
    df['output_length'] = count_words(df['output'])
    
    print(f"{name} Dataset Statistics:")
    print(f"  Total Entries: {len(df)}")
    print(f"  Avg Instruction Length: {df['instruction_length'].mean():.2f} words")
    print(f"  Avg Input Length: {df['input_length'].mean():.2f} words")
    print(f"  Avg Output Length: {df['output_length'].mean():.2f} words")
    print(f"  Input-to-Output Ratio: {df['input_length'].mean() / df['output_length'].mean():.2f}")
    print()

Word Frequency Analysis

def create_frequency_analysis():
    """Generate word frequency charts for all datasets"""
    print("=== WORD FREQUENCY ANALYSIS ===\n")
    
    for name, df in datasets.items():
        print(f"\n{name} Dataset - Top Terms:")
        
        # Get top terms for input and output
        input_terms = get_most_common_terms(df['input'], 10)
        output_terms = get_most_common_terms(df['output'], 10)
        
        print("Input Field:")
        for term, count in input_terms[:5]:
            print(f"  {term}: {count}")
            
        print("Output Field:")
        for term, count in output_terms[:5]:
            print(f"  {term}: {count}")
        
        # Create visualization
        fig, axes = plt.subplots(1, 2, figsize=(20, 6))
        
        # Input field analysis
        terms, counts = zip(*input_terms)
        sns.barplot(x=list(terms), y=list(counts), ax=axes[0], palette='viridis')
        axes[0].set_title(f'Top Medical Terms in Input Field - {name}', fontsize=14)
        axes[0].set_xlabel('Terms', fontsize=12)
        axes[0].set_ylabel('Frequency', fontsize=12)
        axes[0].tick_params(axis='x', rotation=45)
        
        # Output field analysis
        terms, counts = zip(*output_terms)
        sns.barplot(x=list(terms), y=list(counts), ax=axes[1], palette='viridis')
        axes[1].set_title(f'Top Medical Terms in Output Field - {name}', fontsize=14)
        axes[1].set_xlabel('Terms', fontsize=12)
        axes[1].set_ylabel('Frequency', fontsize=12)
        axes[1].tick_params(axis='x', rotation=45)
        
        plt.tight_layout()
        plt.show()

# Run the analysis
create_frequency_analysis()

Word Cloud Generation

def generate_comprehensive_wordclouds():
    """Generate word clouds for all datasets"""
    print("=== WORD CLOUD GENERATION ===\n")
    
    def create_wordcloud(texts, title):
        """Generate and display a word cloud from texts"""
        text = ' '.join([t for t in texts if isinstance(t, str)])
        
        # Preprocess text
        text = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())
        
        # Create stopwords set
        stop_words = set(stopwords.words('english'))
        medical_stopwords = {
            'the', 'and', 'of', 'to', 'is', 'in', 'that', 'for', 'it', 'with', 
            'as', 'are', 'on', 'this', 'by', 'be', 'from', 'an', 'or', 'at', 
            'not', 'you', 'your', 'doctor', 'patient', 'medicine'
        }
        stop_words.update(medical_stopwords)
        
        # Create WordCloud
        wordcloud = WordCloud(
            width=800, 
            height=400,
            background_color='white',
            stopwords=stop_words,
            max_words=100,
            contour_width=3,
            contour_color='steelblue'
        ).generate(text)
        
        # Display the word cloud
        plt.figure(figsize=(12, 8))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(title, fontsize=16)
        plt.tight_layout()
        plt.show()

    # Generate word clouds for input and output fields of each dataset
    for name, df in datasets.items():
        print(f"Generating word clouds for {name} dataset...")
        create_wordcloud(df['input'], f'Word Cloud - {name} Dataset Input Field')
        create_wordcloud(df['output'], f'Word Cloud - {name} Dataset Output Field')

# Run word cloud generation
generate_comprehensive_wordclouds()

Length Distribution Analysis

def analyze_length_distributions():
    """Analyze and visualize text length distributions"""
    print("=== LENGTH DISTRIBUTION ANALYSIS ===\n")
    
    # Display summary statistics
    for name, df in datasets.items():
        print(f"{name} Length Statistics:")
        print(f"  Input Length - Mean: {df['input_length'].mean():.1f}, "
              f"Median: {df['input_length'].median():.1f}, "
              f"Std: {df['input_length'].std():.1f}")
        print(f"  Output Length - Mean: {df['output_length'].mean():.1f}, "
              f"Median: {df['output_length'].median():.1f}, "
              f"Std: {df['output_length'].std():.1f}")
        print()
    
    # Create comprehensive distribution plots
    fig, axes = plt.subplots(2, 1, figsize=(14, 12))
    
    # Input length distributions
    sns.histplot(data=general_df, x='input_length', kde=True, label='General Medical', 
                 ax=axes[0], alpha=0.5, bins=30, color='blue')
    sns.histplot(data=evaluation_df, x='input_length', kde=True, label='Evaluation Medical', 
                 ax=axes[0], alpha=0.5, bins=30, color='green')
    sns.histplot(data=genmedgpt_df, x='input_length', kde=True, label='GenMedGPT-5k', 
                 ax=axes[0], alpha=0.5, bins=30, color='red')
    
    axes[0].set_title('Distribution of Input Lengths (Word Count)', fontsize=14)
    axes[0].set_xlabel('Word Count', fontsize=12)
    axes[0].set_ylabel('Frequency', fontsize=12)
    axes[0].legend()
    
    # Output length distributions
    sns.histplot(data=general_df, x='output_length', kde=True, label='General Medical', 
                 ax=axes[1], alpha=0.5, bins=30, color='blue')
    sns.histplot(data=evaluation_df, x='output_length', kde=True, label='Evaluation Medical', 
                 ax=axes[1], alpha=0.5, bins=30, color='green')
    sns.histplot(data=genmedgpt_df, x='output_length', kde=True, label='GenMedGPT-5k', 
                 ax=axes[1], alpha=0.5, bins=30, color='red')
    
    axes[1].set_title('Distribution of Output Lengths (Word Count)', fontsize=14)
    axes[1].set_xlabel('Word Count', fontsize=12)
    axes[1].set_ylabel('Frequency', fontsize=12)
    axes[1].legend()
    
    plt.tight_layout()
    plt.show()
    
    # Create box plots for comparison
    plt.figure(figsize=(14, 8))
    
    # Prepare data for box plots
    input_data = [
        general_df['input_length'],
        evaluation_df['input_length'],
        genmedgpt_df['input_length']
    ]
    
    output_data = [
        general_df['output_length'],
        evaluation_df['output_length'],
        genmedgpt_df['output_length']
    ]
    
    # Create box plots
    positions = [1, 2, 3, 5, 6, 7]
    box = plt.boxplot(input_data + output_data, positions=positions, patch_artist=True)
    
    # Set colors
    colors = ['lightblue', 'lightgreen', 'lightcoral', 'skyblue', 'mediumseagreen', 'salmon']
    for patch, color in zip(box['boxes'], colors):
        patch.set_facecolor(color)
    
    # Add labels
    plt.xticks(positions, ['GM Input', 'EM Input', 'GP5k Input', 
                           'GM Output', 'EM Output', 'GP5k Output'])
    plt.title('Input and Output Length Box Plots', fontsize=14)
    plt.ylabel('Word Count', fontsize=12)
    
    # Add vertical separation line
    plt.axvline(x=4, color='gray', linestyle='--', alpha=0.7)
    plt.text(2, plt.ylim()[1] * 0.95, 'Input Lengths', horizontalalignment='center')
    plt.text(6, plt.ylim()[1] * 0.95, 'Output Lengths', horizontalalignment='center')
    
    plt.tight_layout()
    plt.show()

# Run length distribution analysis
analyze_length_distributions()

Medical Condition Analysis

def analyze_medical_conditions():
    """Comprehensive medical condition analysis across datasets"""
    print("=== MEDICAL CONDITION ANALYSIS ===\n")
    
    def get_condition_percentages(df):
        """Calculate medical condition percentages for a dataset"""
        all_conditions = []
        for text in df['input'].fillna('') + ' ' + df['output'].fillna(''):
            conditions = identify_medical_conditions(text)
            all_conditions.extend(conditions)
        
        condition_counts = Counter(all_conditions)
        total_docs = len(df)
        
        return {cond: (count / total_docs) * 100 
                for cond, count in condition_counts.items()}
    
    # Get condition percentages for each dataset
    gm_conditions = get_condition_percentages(general_df)
    em_conditions = get_condition_percentages(evaluation_df)
    gp_conditions = get_condition_percentages(genmedgpt_df)
    
    # Display top conditions for each dataset
    print("Top Medical Conditions by Dataset:")
    for name, conditions in [("General Medical", gm_conditions), 
                            ("Evaluation Medical", em_conditions), 
                            ("GenMedGPT-5k", gp_conditions)]:
        print(f"\n{name}:")
        sorted_conditions = sorted(conditions.items(), key=lambda x: x[1], reverse=True)
        for condition, percentage in sorted_conditions[:10]:
            print(f"  {condition}: {percentage:.2f}%")
    
    # Create comparison heatmap
    common_conditions = ['syndrome', 'disease', 'carcinoma', 'infection', 'tumor', 
                        'virus', 'blood', 'kidney', 'heart', 'acute', 'cancer', 
                        'chronic', 'liver', 'pain', 'anxiety']
    
    comparison_data = {
        'Condition': common_conditions,
        'General Medical': [gm_conditions.get(cond, 0) for cond in common_conditions],
        'Evaluation Medical': [em_conditions.get(cond, 0) for cond in common_conditions],
        'GenMedGPT-5k': [gp_conditions.get(cond, 0) for cond in common_conditions]
    }
    
    comparison_df = pd.DataFrame(comparison_data)
    comparison_df_pivot = comparison_df.set_index('Condition')
    
    plt.figure(figsize=(12, 10))
    sns.heatmap(comparison_df_pivot, annot=True, cmap='YlGnBu', fmt='.2f', linewidths=.5)
    plt.title('Comparison of Medical Conditions Across Datasets (% of Documents)', fontsize=14)
    plt.tight_layout()
    plt.show()
    
    return gm_conditions, em_conditions, gp_conditions

# Run medical condition analysis
gm_cond, em_cond, gp_cond = analyze_medical_conditions()

Correlation Analysis

def analyze_correlations():
    """Analyze correlations between input and output lengths"""
    print("=== CORRELATION ANALYSIS ===\n")
    
    # Calculate and display correlation coefficients
    print("Input-Output Length Correlations:")
    correlations = {}
    for name, df in datasets.items():
        corr = df['input_length'].corr(df['output_length'])
        correlations[name] = corr
        print(f"  {name}: {corr:.4f}")
    
    print("\nInterpretation:")
    print("  > 0.3: Strong positive correlation")
    print("  0.1-0.3: Moderate positive correlation") 
    print("  < 0.1: Weak or no correlation")
    
    # Create scatter plot with regression lines
    plt.figure(figsize=(15, 10))
    
    # Create scatter plot with different colors for each dataset
    plt.scatter(general_df['input_length'], general_df['output_length'], 
               alpha=0.5, label=f'General Medical (r={correlations["General Medical"]:.3f})', 
               color='blue', s=20)
    plt.scatter(evaluation_df['input_length'], evaluation_df['output_length'], 
               alpha=0.5, label=f'Evaluation Medical (r={correlations["Evaluation Medical"]:.3f})', 
               color='green', s=20)
    plt.scatter(genmedgpt_df['input_length'], genmedgpt_df['output_length'], 
               alpha=0.5, label=f'GenMedGPT-5k (r={correlations["GenMedGPT-5k"]:.3f})', 
               color='red', s=20)
    
    # Add regression lines
    sns.regplot(x=general_df['input_length'], y=general_df['output_length'], 
               scatter=False, color='blue', ax=plt.gca())
    sns.regplot(x=evaluation_df['input_length'], y=evaluation_df['output_length'], 
               scatter=False, color='green', ax=plt.gca())
    sns.regplot(x=genmedgpt_df['input_length'], y=genmedgpt_df['output_length'], 
               scatter=False, color='red', ax=plt.gca())
    
    plt.title('Relationship Between Input and Output Lengths', fontsize=14)
    plt.xlabel('Input Length (Word Count)', fontsize=12)
    plt.ylabel('Output Length (Word Count)', fontsize=12)
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    return correlations

# Run correlation analysis
correlations = analyze_correlations()

Complete Analysis Summary

def generate_summary_report():
    """Generate a comprehensive summary report"""
    print("=== COMPREHENSIVE ANALYSIS SUMMARY ===\n")
    
    print("DATASET OVERVIEW:")
    print("================")
    total_entries = sum(len(df) for df in datasets.values())
    print(f"Total entries across all datasets: {total_entries:,}")
    
    for name, df in datasets.items():
        print(f"\n{name}:")
        print(f"  • Entries: {len(df):,}")
        print(f"  • Avg Input Length: {df['input_length'].mean():.1f} words")
        print(f"  • Avg Output Length: {df['output_length'].mean():.1f} words")
        print(f"  • Input-Output Ratio: {df['input_length'].mean() / df['output_length'].mean():.2f}")
    
    print("\nKEY FINDINGS:")
    print("=============")
    print("1. Dataset Specialization:")
    print("   • General Medical: Educational content with balanced terminology")
    print("   • Evaluation Medical: Exam format with high input-to-output ratio (11.25)")
    print("   • GenMedGPT-5k: Clinical dialogue with pain management focus (34.6%)")
    
    print("\n2. Text Length Patterns:")
    print("   • Evaluation Medical: Longest inputs, shortest outputs")
    print("   • GenMedGPT-5k: Longest outputs, moderate inputs")  
    print("   • General Medical: Balanced input-output lengths")
    
    print("\n3. Correlation Insights:")
    for name, corr in correlations.items():
        if corr > 0.2:
            interpretation = "Strong positive correlation"
        elif corr > 0.1:
            interpretation = "Moderate positive correlation"
        else:
            interpretation = "Weak/no correlation"
        print(f"   • {name}: {corr:.3f} ({interpretation})")
    
    print("\nRECOMMENDED USE CASES:")
    print("======================")
    print("• Medical Education: General Medical Dataset")
    print("• Exam Preparation: Evaluation Medical Dataset")
    print("• Patient Consultation AI: GenMedGPT-5k Dataset")
    print("• Comprehensive Training: All three datasets combined")
    
    print(f"\nAnalysis completed successfully!")
    print(f"Total visualizations generated: 12+")
    print(f"Statistical measures calculated: 25+")

# Generate final summary
generate_summary_report()

📋 Running the Complete Analysis

To run the complete analysis, execute the code sections in this order:
1

Setup and Data Loading

Run the imports, configuration, and data loading code first.
2

Basic Statistics

Execute the dataset statistics analysis to understand your data structure.
3

Text Analysis

Run word frequency analysis and word cloud generation for content insights.
4

Distribution Analysis

Execute length distribution analysis to understand text patterns.
5

Advanced Analysis

Run medical condition analysis and correlation analysis for deeper insights.
6

Summary Report

Generate the comprehensive summary report with all findings.

Expected Output

When you run this complete analysis, you’ll get:
  • 12+ interactive visualizations including bar charts, word clouds, histograms, box plots, heatmaps, and scatter plots
  • 25+ statistical measures including means, medians, correlations, and percentages
  • Comprehensive text analysis with frequency analysis and medical condition identification
  • Actionable insights for dataset selection and use case recommendations