https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

初始模型

我们以 Working With Text Data 教程中提到的方法作为初始模型，代码如下：

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import logging
from optparse import OptionParser
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

if __name__ == '__main__':
    # Baseline for the Jigsaw toxic-comment challenge: train one independent
    # bag-of-words + TF-IDF + linear classifier per label on the training CSV,
    # then write the positive-class probability of every test comment to
    # "<testing_data>.out".

    FORMAT = '[%(asctime)-15s] %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=FORMAT)

    # Single source of truth for the label columns; the same order is used
    # for training and for the columns of the output file.
    LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    parser = OptionParser()
    parser.add_option("--training_data", dest="training_data", metavar="FILE", help="training data")
    parser.add_option("--testing_data", dest="testing_data", metavar="FILE", help="testing data")

    if len(sys.argv) == 1:
        parser.print_help()
        # sys.exit() rather than the interactive-only `exit()` helper, which
        # is injected by the `site` module and is not guaranteed to exist.
        sys.exit()

    (options, args) = parser.parse_args()

    # Fail with a usage message instead of an obscure pandas error when only
    # one of the two required files was given.
    if not options.training_data or not options.testing_data:
        parser.error("both --training_data and --testing_data are required")

    # dtype=object keeps every column as strings, so the label values are the
    # strings found in the file rather than integers.
    df = pd.read_csv(options.training_data, dtype=object)
    # read_csv converts empty fields (and tokens such as "NA") to NaN, which
    # CountVectorizer refuses to tokenize; treat them as empty documents.
    train_text = df['comment_text'].fillna('')

    clfs = []
    for label in LABELS:
        logging.info("processing %s...", label)
        # NOTE: loss='log' was renamed to 'log_loss' in newer scikit-learn
        # releases; switch the value if the installed version rejects 'log'.
        text_clf = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', SGDClassifier(loss='log', penalty='l2',
                                  alpha=1e-3, random_state=42,
                                  max_iter=5, tol=None)),
        ])
        text_clf.fit(train_text, df[label])
        clfs.append(text_clf)

    df = pd.read_csv(options.testing_data, dtype=object)
    test_text = df['comment_text'].fillna('')

    # Score the whole test set with one predict_proba call per classifier
    # instead of one call per row per classifier. Column 1 of the result is
    # the probability of the second entry of the classifier's classes_.
    df_out = pd.DataFrame({'id': df['id']})
    for label, clf in zip(LABELS, clfs):
        df_out[label] = clf.predict_proba(test_text)[:, 1]

    df_out.to_csv(options.testing_data + '.out', index=False)

| Model | Private Score |
| --- | --- |
| Working With Text Data (Benchmark) | 0.8635 |

预处理

模型

单一模型

集成模型

results matching ""

    No results matching ""