import pandas as pd
#модели (алгоритмы)
from sklearn.dummy import DummyClassifier # <== Простейшая возможная модель
from sklearn.linear_model import LogisticRegression # <== Логистическая (линейная)регрессия
#метрика успеха
from sklearn.metrics import accuracy_score
# Load the names dataset and take a first look at it.
df = pd.read_csv("../input/polish_names.csv")

df.head(5)                   # first five rows
df.info()                    # per-column summary
df.sample(n=10)              # ten randomly chosen rows
df['gender'].value_counts()  # how often each gender label occurs
def transform_string_into_number(value):
    """Return ``value`` unchanged.

    Identity placeholder: whatever is passed in comes straight back out.
    """
    unchanged = value
    return unchanged
# Run the mapper over the first five gender labels to see what it yields.
df['gender'].head(5).map(transform_string_into_number)
def transform_string_into_number(value):
    """Encode a gender label as an integer.

    Returns 1 when ``value`` equals ``'m'`` and 0 for anything else.
    """
    if value == 'm':
        return 1
    return 0
# Check the encoder on the first five gender labels.
df['gender'].head(5).map(transform_string_into_number)

# Build the numeric target column: 1 for 'm', 0 for every other label.
df['target'] = df['gender'].map(transform_string_into_number)
df.head(10)

# A single feature (name length) as a 2-D array, and the target as a 1-D array.
X = df[['len_name']].values
y = df['target'].values

# Baseline 1: the simplest possible model.
# NOTE(review): the default DummyClassifier strategy depends on the installed
# scikit-learn version -- confirm which one this notebook was written for.
model = DummyClassifier()
y_pred = model.fit(X, y).predict(X)

df['gender_pred'] = y_pred
df['gender_pred'].value_counts()

# Rows where the prediction is wrong.
df[df['target'] != y_pred].shape
accuracy_score(y, y_pred)

# Baseline 2: the same dummy model with a fixed random seed.
model = DummyClassifier(random_state=0)
y_pred = model.fit(X, y).predict(X)
accuracy_score(y, y_pred)

# First real model: logistic regression on the name length.
model = LogisticRegression(solver='lbfgs')
y_pred = model.fit(X, y).predict(X)
accuracy_score(y, y_pred)

df['gender_pred'] = y_pred
df['gender_pred'].value_counts()

# Baseline 3: predict 1 for every row; the number of ones must match the
# number of rows in X.
y_pred = [1] * len(X)
accuracy_score(y, y_pred)
def train_and_predict_model(X, y, model, success_metric=accuracy_score):
    """Fit ``model`` on ``(X, y)``, predict on the same ``X`` and score the result.

    The number of times each predicted value occurs is printed under a
    "Distribution:" header before the score is returned.

    Args:
        X: features handed to ``model.fit`` and ``model.predict``.
        y: targets handed to ``model.fit`` and to ``success_metric``.
        model: object exposing ``fit(X, y)`` and ``predict(X)``.
        success_metric: callable invoked as ``success_metric(y, predictions)``;
            defaults to ``accuracy_score``.

    Returns:
        Whatever ``success_metric`` returns for ``y`` and the predictions.
    """
    model.fit(X, y)
    predictions = model.predict(X)  # same X that was used for fitting

    print("Distribution:")
    predicted_counts = pd.Series(predictions).value_counts()
    print(predicted_counts)

    score = success_metric(y, predictions)
    return score
# Lowercase Polish vowel letters used by the name-feature helpers.
# 'ó' is included: it is a vowel letter of the Polish alphabet and would
# otherwise be treated as a consonant (e.g. in "Józef").
vowels = ['a', 'ą', 'e', 'ę', 'i', 'o', 'ó', 'u', 'y']


def how_many_vowels(name):
    """Count the characters of ``name`` that are vowels.

    The name is lowercased first, so the count is case-insensitive.

    Args:
        name: the string to inspect.

    Returns:
        Number of characters of ``name.lower()`` found in ``vowels``
        (0 for an empty string).
    """
    # Each membership test is a bool; summing them yields the count as an int.
    return sum(letter in vowels for letter in name.lower())
# Example: how_many_vowels('Jana') gives 2.
df['count_vowels'] = df['name'].map(how_many_vowels)

# Score a model that sees the name length together with the vowel count.
train_and_predict_model(
    df[['len_name', 'count_vowels']],
    y,
    LogisticRegression(solver='lbfgs'),
)
def first_is_vowel(name):
    """Tell whether the first character of ``name`` is a vowel.

    The name is lowercased before the check, and membership is tested
    against the module-level ``vowels`` list.

    Args:
        name: the string to inspect.

    Returns:
        ``True`` if the first lowercased character is in ``vowels``,
        otherwise ``False``. An empty ``name`` has no first character and
        yields ``False`` rather than raising ``IndexError``.
    """
    lowered = name.lower()
    if not lowered:
        # Guard: indexing an empty string would raise IndexError.
        return False
    return lowered[0] in vowels
# Example: first_is_vowel('Ada') gives True.
df['first_is_vowel'] = df['name'].map(first_is_vowel)

# Name length + "starts with a vowel" flag.
train_and_predict_model(
    df[['len_name', 'first_is_vowel']],
    y,
    LogisticRegression(solver='lbfgs'),
)

# All three features collected so far.
X = df[['len_name', 'count_vowels', 'first_is_vowel']]
train_and_predict_model(X, y, LogisticRegression(solver='lbfgs'))

# pd.factorize demo: the full result first, then only its first element
# (the integer codes).
pd.factorize(['blue', 'green', 'yellow', 'blue'])
pd.factorize(['blue', 'green', 'yellow', 'blue'])[0]

# Lowercased first letter of every name, then its integer code.
df['first_letter'] = df['name'].map(lambda name: name.lower()[0])
df['first_letter_cnt'] = pd.factorize(df['first_letter'])[0]

X = df[['len_name', 'count_vowels', 'first_is_vowel', 'first_letter_cnt']]
train_and_predict_model(X, y, LogisticRegression(solver='lbfgs'))
def get_all_vowels(name):
    """Return the vowels of ``name`` as one string, in order of appearance.

    The name is lowercased first; only characters present in the
    module-level ``vowels`` list are kept.
    """
    kept = filter(lambda letter: letter in vowels, name.lower())
    return ''.join(kept)
# Example: get_all_vowels('Sławomir') gives 'aoi'.
df['all_vowels'] = df['name'].map(get_all_vowels)

# Replace each distinct vowel string with an integer code.
df['all_vowels_cnt'] = df['all_vowels'].factorize()[0]

X = df[[
    'len_name',
    'count_vowels',
    'first_is_vowel',
    'first_letter_cnt',
    'all_vowels_cnt',
]]
train_and_predict_model(X, y, LogisticRegression(solver='lbfgs'))
def get_all_consonants(name):
    """Return every non-vowel character of ``name`` as one string.

    The name is lowercased first; any character that is not in the
    module-level ``vowels`` list is kept, in order of appearance.
    """
    kept = filter(lambda letter: letter not in vowels, name.lower())
    return ''.join(kept)
# Example: get_all_consonants('Sławomir') gives 'słwmr'.
df['all_consonants'] = df['name'].map(get_all_consonants)

# Replace each distinct consonant string with an integer code.
df['all_consonants_cnt'] = df['all_consonants'].factorize()[0]

# NOTE(review): 'all_vowels_cnt' is not part of this feature set -- confirm
# that leaving it out is intentional.
X = df[[
    'len_name',
    'count_vowels',
    'first_is_vowel',
    'first_letter_cnt',
    'all_consonants_cnt',
]]
train_and_predict_model(
    X,
    y,
    LogisticRegression(solver='lbfgs', max_iter=200),
)
def last_is_vowel(name):
    """Tell whether the last character of ``name`` is a vowel.

    The name is lowercased before the check, and membership is tested
    against the module-level ``vowels`` list.

    Args:
        name: the string to inspect.

    Returns:
        ``True`` if the last lowercased character is in ``vowels``,
        otherwise ``False``. An empty ``name`` has no last character and
        yields ``False`` rather than raising ``IndexError``.
    """
    lowered = name.lower()
    if not lowered:
        # Guard: indexing an empty string would raise IndexError.
        return False
    return lowered[-1] in vowels
# Example: last_is_vowel('Ada') gives True.
df['last_is_vowel'] = df['name'].map(last_is_vowel)

# How far does the "ends with a vowel" flag get on its own?
X = df[['last_is_vowel']]
train_and_predict_model(
    X,
    y,
    LogisticRegression(solver='lbfgs', max_iter=200),
)

# Combine the flag with the earlier features.
feats = [
    'last_is_vowel',
    'len_name',
    'count_vowels',
    'first_is_vowel',
    'all_vowels_cnt',
    'all_consonants_cnt',
]
X = df[feats]
train_and_predict_model(
    X,
    y,
    LogisticRegression(solver='lbfgs', max_iter=200),
)

df.columns

# Flag names whose last character is exactly 'a' (no lowercasing here).
df['lst_letter_a'] = df['name'].map(lambda name: name[-1] == 'a')

# Rows labelled 'm' whose name ends in 'a'.
df[(df['gender'] == 'm') & df['lst_letter_a']]

# Rows labelled 'f' whose name does not end in 'a'.
df[(df['gender'] == 'f') & ~df['lst_letter_a']]