In [37]:
# data wrangling
import pandas as pd
from tqdm import tqdm
# Register tqdm with pandas so that `progress_apply` is available on
# Series/DataFrames; it is used below to show progress during prediction.
tqdm.pandas()
# prediction
from nametrace import NameTracer
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
# Make "notebook" the default plotly renderer used by `fig.show()`.
pio.renderers.default = "notebook"
In [38]:
# Read the names to classify; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="repec_5p_names.csv")
# Peek at the first rows to confirm the file parsed as expected.
print(df.head())
name 0 Daron Acemoglu 1 Muhammad Shahbaz 2 Yuriy Gorodnichenko 3 Rangan Gupta 4 John List
In [39]:
# Create one NameTracer instance; it is reused for every prediction below.
nt = NameTracer()
In [45]:
# Smoke-test the tracer on a single name to see what a prediction looks like:
# a dict with 'is_human', 'gender', 'subregion' and a nested 'confidence' dict.
result = nt.predict("Will Smith")
print(result)
{'is_human': True, 'gender': 'male', 'subregion': 'Northern America', 'confidence': {'human': 1.0, 'gender': 0.9337253570556641, 'subregion': 0.39760878682136536}}
In [46]:
def predict(name):
    """Return the predicted ``(gender, subregion)`` pair for a single name.

    Delegates to the notebook-level ``nt`` tracer and keeps only the two
    labels, discarding the other fields of the prediction dict.
    """
    prediction = nt.predict(name)
    return tuple(prediction[field] for field in ('gender', 'subregion'))
In [51]:
# Predict gender and subregion for every name, with a tqdm progress bar.
# `predict` returns a plain (gender, subregion) tuple, so the tuples are
# collected first and turned into a single DataFrame afterwards. This avoids
# constructing one pd.Series per row, which is the slow part of the
# `lambda x: pd.Series(...)` pattern.
predicted_labels = df['name'].progress_apply(predict)
df[['gender', 'subregion']] = pd.DataFrame(
    predicted_labels.tolist(),
    index=df.index,  # keep row alignment with the source frame
    columns=['gender', 'subregion'],
)
print(df.head())
100%|██████████| 4110/4110 [00:17<00:00, 228.57it/s]
name gender subregion 0 Daron Acemoglu male Eastern Europe 1 Muhammad Shahbaz male Southern Asia 2 Yuriy Gorodnichenko male Eastern Europe 3 Rangan Gupta male Southern Asia 4 John List male Northern America
In [59]:
# Set style
# Apply the 'seaborn-v0_8' matplotlib stylesheet first, then set seaborn's
# "husl" palette; both calls change global plotting state for later cells.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
In [61]:
# Gender Distribution Visualization
# Two side-by-side panels: share of each predicted gender (pie) and the
# absolute number of names per gender (bar).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

gender_counts = df.gender.value_counts()

# One colour per category. The two base colours go to the two most frequent
# labels; any additional label gets a neutral grey instead of silently
# cycling back to the first colour.
base_colors = ['#ff9999', '#66b3ff']
extra_categories = max(len(gender_counts) - len(base_colors), 0)
colors = (base_colors + ['#c2c2c2'] * extra_categories)[:len(gender_counts)]

# Pie chart for gender
wedges, texts, autotexts = ax1.pie(
    gender_counts.values,
    labels=gender_counts.index,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
)
ax1.set_title('Gender Distribution', fontsize=16, fontweight='bold')

# Bar chart for gender with counts
bars = ax2.bar(gender_counts.index, gender_counts.values, color=colors, alpha=0.8)
ax2.set_title('Gender Distribution (Count)', fontsize=16, fontweight='bold')
ax2.set_ylabel('Count', fontsize=12)
ax2.set_xlabel('Gender', fontsize=12)

# Add count labels on bars. `bar_label` offsets the text in points, so the
# spacing stays the same whatever the magnitude of the counts (a fixed
# data-unit offset such as `height + 20` does not).
ax2.bar_label(
    bars,
    labels=[f'{int(value)}' for value in gender_counts.values],
    padding=3,
    fontsize=12,
    fontweight='bold',
)
# Leave some headroom above the tallest bar for its label.
ax2.margins(y=0.1)

plt.tight_layout()
plt.show()
In [ ]:
# World Map of Subregion Distribution
# Create a mapping from subregions to representative countries for visualization.
# Countries are given as three-letter codes and matched with plotly's 'ISO-3'
# location mode. Every country of a subregion is coloured with the value of
# the subregion as a whole, not with a per-country figure.
subregion_to_countries = {
    'Northern America': ['USA', 'CAN'],
    'Western Europe': ['DEU', 'FRA', 'NLD', 'BEL', 'CHE', 'AUT'],
    'Southern Europe': ['ITA', 'ESP', 'GRC', 'PRT', 'HRV', 'SVN', 'MKD', 'ALB', 'MNE', 'SRB'],
    'Northern Europe': ['GBR', 'IRL', 'DNK', 'SWE', 'NOR', 'FIN', 'ISL', 'EST', 'LVA', 'LTU'],
    'Eastern Asia': ['CHN', 'JPN', 'KOR', 'PRK', 'MNG'],
    'Southern Asia': ['IND', 'PAK', 'BGD', 'LKA', 'NPL', 'BTN', 'AFG', 'MDV'],
    'Eastern Europe': ['RUS', 'POL', 'UKR', 'CZE', 'SVK', 'HUN', 'ROU', 'BGR', 'BLR', 'MDA'],
    'South America': ['BRA', 'ARG', 'CHL', 'PER', 'COL', 'VEN', 'ECU', 'BOL', 'PRY', 'URY', 'GUY', 'SUR'],
    'South-eastern Asia': ['IDN', 'THA', 'VNM', 'PHL', 'MYS', 'SGP', 'MMR', 'KHM', 'LAO', 'BRN', 'TLS'],
    'Western Asia': ['TUR', 'IRN', 'IRQ', 'SAU', 'SYR', 'JOR', 'ISR', 'LBN', 'ARE', 'KWT', 'QAT', 'BHR', 'OMN', 'YEM'],
    'Central America': ['MEX', 'GTM', 'CUB', 'HND', 'NIC', 'CRI', 'PAN', 'SLV', 'BLZ'],
    'Western Africa': ['NGA', 'GHA', 'CIV', 'MLI', 'BFA', 'NER', 'SEN', 'GIN', 'SLE', 'LBR', 'TGO', 'BEN', 'MRT', 'GMB', 'GNB', 'CPV'],
    'Northern Africa': ['EGY', 'LBY', 'TUN', 'DZA', 'MAR', 'SDN'],
    'Eastern Africa': ['ETH', 'KEN', 'UGA', 'TZA', 'RWA', 'BDI', 'SOM', 'ERI', 'DJI', 'COM', 'MUS', 'SYC', 'MDG'],
    'Middle Africa': ['COD', 'AGO', 'CMR', 'TCD', 'CAF', 'COG', 'GAB', 'GNQ', 'STP']
}

# Create data for the map
subregion_counts = df.subregion.value_counts()
total_names = len(df)

# Predicted subregions without an entry in the mapping cannot be drawn.
# Report them instead of dropping them silently, so the reader knows the map
# does not account for every name.
unmapped_counts = {
    subregion: int(count)
    for subregion, count in subregion_counts.items()
    if subregion not in subregion_to_countries
}
if unmapped_counts:
    print(f"Subregions not shown on the map (no country mapping): {unmapped_counts}")

map_data = []
for subregion, countries in subregion_to_countries.items():
    count = subregion_counts.get(subregion, 0)
    # Share of all names (0-100); the guard also covers an empty frame.
    percentage = count / total_names * 100 if count > 0 else 0
    map_data.extend(
        {
            'country': country,
            'subregion': subregion,
            'count': count,
            'percentage': percentage
        }
        for country in countries
    )
map_df = pd.DataFrame(map_data)

# Create the choropleth map
fig = px.choropleth(
    map_df,
    locations='country',
    locationmode='ISO-3',
    color='percentage',
    hover_name='subregion',
    # hover_data formats are d3-format specifiers. ':.1f%' is not a valid
    # specifier, so the value was shown unformatted; ':.1f' gives one decimal
    # and the unit comes from the 'Percentage (%)' label below.
    hover_data={'count': True, 'percentage': ':.1f'},
    color_continuous_scale='Viridis',
    title='Geographic Distribution of Names by Subregion<br><sub>Color intensity represents relative occurrence (%)</sub>',
    labels={'percentage': 'Percentage (%)', 'count': 'Count'}
)
fig.update_layout(
    title_font_size=16,
    title_x=0.5,
    geo=dict(
        showframe=False,
        showcoastlines=True,
        projection_type='equirectangular'
    ),
    coloraxis_colorbar=dict(
        # Explicit nested form of the colorbar title text and font size.
        title=dict(text="Percentage (%)", font=dict(size=12))
    ),
    width=1000,
    height=600
)
fig.show()