https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge
Initial Model

We take the approach from the scikit-learn tutorial Working With Text Data as our initial model:
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import logging
from optparse import OptionParser

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

if __name__ == '__main__':
    FORMAT = '[%(asctime)-15s] %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=FORMAT)

    parser = OptionParser()
    parser.add_option("--training_data", dest="training_data", metavar="FILE", help="training data")
    parser.add_option("--testing_data", dest="testing_data", metavar="FILE", help="testing data")
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit()
    (options, args) = parser.parse_args()

    # Train one independent binary classifier per label:
    # bag-of-words counts -> TF-IDF weighting -> logistic regression fitted with SGD.
    df = pd.read_csv(options.training_data, dtype=object)
    clfs = []
    for label in LABELS:
        logging.info("processing " + label + "...")
        text_clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             # loss='log' (renamed 'log_loss' in scikit-learn >= 1.1)
                             # makes this a logistic regression, so predict_proba is available.
                             ('clf', SGDClassifier(loss='log', penalty='l2',
                                                   alpha=1e-3, random_state=42,
                                                   max_iter=5, tol=None)),
                             ])
        text_clf.fit(df['comment_text'], df[label])
        clfs.append(text_clf)

    # For each test comment, record the positive-class probability from every classifier.
    df = pd.read_csv(options.testing_data, dtype=object)
    df_out_list = []
    for i, row in df.iterrows():
        new_item = [row['id']]
        for clf in clfs:
            predicted = clf.predict_proba([row['comment_text']])
            # predict_proba columns follow clf.classes_ (['0', '1']), so [0][1] is P(label == '1').
            new_item.append(predicted[0][1])
        df_out_list.append(new_item)
    df_out = pd.DataFrame(df_out_list, columns=['id'] + LABELS)
    df_out.to_csv(options.testing_data + '.out', index=False)
```
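A note on `predict_proba`: the order of its output columns follows the fitted classifier's `classes_` attribute, so `predicted[0][1]` above relies on the positive label `'1'` sorting second. A minimal sketch on made-up toy comments (not the competition data) that verifies this assumption:

```python
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Hypothetical toy data standing in for the comment_text and label columns.
texts = ["you are awful", "have a nice day", "awful awful comment", "nice work"]
labels = ["1", "0", "1", "0"]  # string labels, as produced by read_csv(dtype=object)

clf = Pipeline([("vect", CountVectorizer()),
                ("tfidf", TfidfTransformer()),
                # loss="log" was renamed "log_loss" in scikit-learn >= 1.1.
                ("clf", SGDClassifier(loss="log", penalty="l2", alpha=1e-3,
                                      random_state=42, max_iter=5, tol=None))])
clf.fit(texts, labels)

print(clf.named_steps["clf"].classes_)         # ['0' '1'] -- sorted label order
print(clf.predict_proba(["awful day"])[0][1])  # probability of label '1'
```

The full script can then be run as, e.g., `python baseline.py --training_data train.csv --testing_data test.csv` (the script and file names here are placeholders), producing `test.csv.out` in the submission format.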
| Model | Private Score |
| --- | --- |
| Working With Text Data (Benchmark) | 0.8635 |