# data wrangling
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
# prediction
from nametrace import NameTracer
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = "notebook"

# Load the author-name table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="repec_5p_names.csv")
print(df.head(n=5))

                  name
0       Daron Acemoglu
1     Muhammad Shahbaz
2  Yuriy Gorodnichenko
3         Rangan Gupta
4            John List

# Instantiate the tracer once; `predict()` further down reuses this instance.
nt = NameTracer()

# Run a single example to inspect the shape of the prediction dict
# before applying the model to the whole table.
result = nt.predict("Will Smith")
print(result)

{'is_human': True, 'gender': 'male', 'subregion': 'Northern America', 'confidence': {'human': 1.0, 'gender': 0.9337253570556641, 'subregion': 0.39760878682136536}}

def predict(name):
    """Return the ``(gender, subregion)`` pair predicted for ``name``.

    The name is passed to the module-level ``nt`` tracer and only the
    ``'gender'`` and ``'subregion'`` entries of the returned dict are kept;
    everything else in the prediction is discarded.
    """
    prediction = nt.predict(name)
    gender, subregion = prediction['gender'], prediction['subregion']
    return gender, subregion

# Predict gender and subregion for every name (with a tqdm progress bar).
# `predict` returns a plain tuple per row; the tuples are expanded into two
# columns in a single DataFrame construction instead of building one
# pd.Series per row inside apply, which is markedly slower. Passing the
# index and column names explicitly keeps row alignment and makes the
# two-column shape hold even for an empty input.
df[['gender', 'subregion']] = pd.DataFrame(
    df['name'].progress_apply(predict).tolist(),
    index=df.index,
    columns=['gender', 'subregion'],
)
print(df.head())

100%|██████████| 4110/4110 [00:17<00:00, 228.57it/s]

                  name gender         subregion
0       Daron Acemoglu   male    Eastern Europe
1     Muhammad Shahbaz   male     Southern Asia
2  Yuriy Gorodnichenko   male    Eastern Europe
3         Rangan Gupta   male     Southern Asia
4            John List   male  Northern America

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Gender Distribution Visualization
# Left panel: share of each predicted gender; right panel: absolute counts.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# value_counts() orders categories by frequency, so the colors below follow
# rank order (most frequent first), not a fixed gender label.
gender_counts = df.gender.value_counts()
colors = ['#ff9999', '#66b3ff']

# Pie chart for gender
wedges, texts, autotexts = ax1.pie(gender_counts.values, labels=gender_counts.index,
                                   autopct='%1.1f%%', colors=colors, startangle=90)
ax1.set_title('Gender Distribution', fontsize=16, fontweight='bold')

# Bar chart for gender with counts
bars = ax2.bar(gender_counts.index, gender_counts.values, color=colors, alpha=0.8)
ax2.set_title('Gender Distribution (Count)', fontsize=16, fontweight='bold')
ax2.set_ylabel('Count', fontsize=12)
ax2.set_xlabel('Gender', fontsize=12)

# Add count labels on bars. bar_label pads in points rather than data units,
# so the gap above each bar no longer depends on how large the counts are
# (the previous fixed "+20" offset only suited this dataset's scale).
ax2.bar_label(bars, labels=[f'{int(count)}' for count in gender_counts.values],
              padding=3, fontsize=12, fontweight='bold')
# Leave headroom so the label above the tallest bar stays inside the axes.
ax2.margins(y=0.1)

plt.tight_layout()
plt.show()

# World Map of Subregion Distribution
# Predictions are per subregion, but the choropleth below is keyed by country.
# Each subregion is therefore expanded to a list of representative countries,
# written as three-letter country codes; every listed country is later shaded
# with its subregion's value. The lists are representative, not exhaustive.
subregion_to_countries = {
    'Northern America': ['USA', 'CAN'],
    'Western Europe': ['DEU', 'FRA', 'NLD', 'BEL', 'CHE', 'AUT'],
    'Southern Europe': [
        'ITA', 'ESP', 'GRC', 'PRT', 'HRV',
        'SVN', 'MKD', 'ALB', 'MNE', 'SRB',
    ],
    'Northern Europe': [
        'GBR', 'IRL', 'DNK', 'SWE', 'NOR',
        'FIN', 'ISL', 'EST', 'LVA', 'LTU',
    ],
    'Eastern Asia': ['CHN', 'JPN', 'KOR', 'PRK', 'MNG'],
    'Southern Asia': ['IND', 'PAK', 'BGD', 'LKA', 'NPL', 'BTN', 'AFG', 'MDV'],
    'Eastern Europe': [
        'RUS', 'POL', 'UKR', 'CZE', 'SVK',
        'HUN', 'ROU', 'BGR', 'BLR', 'MDA',
    ],
    'South America': [
        'BRA', 'ARG', 'CHL', 'PER', 'COL', 'VEN',
        'ECU', 'BOL', 'PRY', 'URY', 'GUY', 'SUR',
    ],
    'South-eastern Asia': [
        'IDN', 'THA', 'VNM', 'PHL', 'MYS', 'SGP',
        'MMR', 'KHM', 'LAO', 'BRN', 'TLS',
    ],
    'Western Asia': [
        'TUR', 'IRN', 'IRQ', 'SAU', 'SYR', 'JOR', 'ISR',
        'LBN', 'ARE', 'KWT', 'QAT', 'BHR', 'OMN', 'YEM',
    ],
    'Central America': [
        'MEX', 'GTM', 'CUB', 'HND', 'NIC',
        'CRI', 'PAN', 'SLV', 'BLZ',
    ],
    'Western Africa': [
        'NGA', 'GHA', 'CIV', 'MLI', 'BFA', 'NER', 'SEN', 'GIN',
        'SLE', 'LBR', 'TGO', 'BEN', 'MRT', 'GMB', 'GNB', 'CPV',
    ],
    'Northern Africa': ['EGY', 'LBY', 'TUN', 'DZA', 'MAR', 'SDN'],
    'Eastern Africa': [
        'ETH', 'KEN', 'UGA', 'TZA', 'RWA', 'BDI', 'SOM',
        'ERI', 'DJI', 'COM', 'MUS', 'SYC', 'MDG',
    ],
    'Middle Africa': [
        'COD', 'AGO', 'CMR', 'TCD', 'CAF',
        'COG', 'GAB', 'GNQ', 'STP',
    ],
}

# Create data for the map: one row per (subregion, country) pair, where every
# country carries the count and share of its whole subregion. Only subregions
# listed in `subregion_to_countries` are emitted; predicted subregions missing
# from that mapping do not appear on the map.
subregion_counts = df.subregion.value_counts()

map_data = []
for subregion, countries in subregion_to_countries.items():
    count = subregion_counts.get(subregion, 0)
    if count > 0:
        # Share of all rows in df, expressed on a 0-100 scale.
        percentage = count / len(df) * 100
    else:
        percentage = 0

    map_data.extend(
        {
            'country': country,
            'subregion': subregion,
            'count': count,
            'percentage': percentage,
        }
        for country in countries
    )

map_df = pd.DataFrame(map_data)

# Create the choropleth map
# Countries are shaded by the `percentage` column of map_df; hovering shows
# the subregion name together with its count and percentage.
fig = px.choropleth(
    map_df,
    locations='country',
    color='percentage',
    hover_name='subregion',
    # hover_data format strings are d3-format specifiers. ':.1f' renders one
    # decimal place. A trailing '%' after the 'f' type is not a valid
    # specifier (and the standalone '%' type would multiply by 100, which is
    # wrong for values already on a 0-100 scale), so the unit is conveyed by
    # the "Percentage (%)" label instead.
    hover_data={'count': True, 'percentage': ':.1f'},
    color_continuous_scale='Viridis',
    title='Geographic Distribution of Names by Subregion<br><sub>Color intensity represents relative occurrence (%)</sub>',
    labels={'percentage': 'Percentage (%)', 'count': 'Count'}
)

# Presentation tweaks: fixed canvas size, centered title, a frameless
# equirectangular map with coastlines, and a titled colorbar.
fig.update_layout(
    width=1000,
    height=600,
    title_x=0.5,
    title_font_size=16,
    geo={
        'projection_type': 'equirectangular',
        'showcoastlines': True,
        'showframe': False,
    },
    coloraxis_colorbar={
        'title': "Percentage (%)",
        'title_font_size': 12,
    },
)

fig.show()