I am trying to create a LogisticRegression classifier for a multilabel classification problem:
# Train a TF-IDF + LogisticRegression pipeline through GridSearchCV on a
# pickled training set, report validation metrics, then predict on a pickled
# test set and write the result to CSV.
#
# Load the pickled training data; the path is built from self.dataLocation,
# the literal "train", self.fname and the ".pkl" suffix.
# NOTE(review): pickle.load should only be used on trusted files.
traindf = pickle.load(open(self.dataLocation+"train"+self.fname+".pkl","rb"))
# Select inputs and targets using self.predX / self.predY
# (presumably column names of the loaded frame -- confirm).
X, y = traindf[self.predX], traindf[self.predY]
from sklearn.preprocessing import MultiLabelBinarizer
# Binarize the label sets. After this line y is a 2-D 0/1 array
# (rows = samples, columns = distinct labels), no longer a 1-D target vector.
y=MultiLabelBinarizer().fit_transform(y)
# Hold out part of the data for validation; self.splitFactor is passed as train_size.
Xtrain, Xvalidate , ytrain, yvalidate = train_test_split(X, y, train_size=self.splitFactor)
# Two-step pipeline: text vectorization ('vect') followed by the classifier ('clf').
pip = Pipeline([
('vect', TfidfVectorizer(
analyzer='word',
binary=False,
decode_error='ignore',
# NOTE(review): the next line looks like a pasted repr of a type object and
# is not valid Python syntax as written.
dtype=<type 'numpy.int64'>,
encoding=u'utf-8',
input=u'content',
lowercase=True,
max_df=0.25,
max_features=None,
min_df=1,
ngram_range=(1, 1),
norm=u'l2',
preprocessor=None,
smooth_idf=True,
stop_words='english',
strip_accents=None,
sublinear_tf=True,
token_pattern=u'(?u)\\b\\w\\w+\\b',
# NOTE(review): this passes the loaded punkt object itself as the tokenizer;
# verify that it is usable as a callable that splits a document into tokens.
tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'),
use_idf=True, vocabulary=None)),
# Final estimator. LogisticRegression.fit validates y as a 1-D array, so the
# 2-D indicator matrix produced above is rejected with
# "ValueError: bad input shape".
('clf', LogisticRegression(
C=10,
class_weight=None,
dual=False,
fit_intercept=True,
intercept_scaling=1,
max_iter=100,
multi_class='multinomial',
n_jobs=1,
penalty='l2',
random_state=None,
solver='lbfgs',
tol=0.0001,
verbose=0,
warm_start=False))
])
# Empty parameter grid: no hyper-parameters are varied by the search.
parameters = {}
gridSearchTS = GridSearchCV(pip,parameters,n_jobs=3, verbose=1, scoring='accuracy')
# This is the call that raises the ValueError (2-D ytrain reaches LogisticRegression.fit).
gridSearchTS.fit(Xtrain, ytrain)
# Evaluate on the held-out validation split.
predictions = gridSearchTS.predict(Xvalidate )
print ('Accuracy:', accuracy_score(yvalidate, predictions))
print ('Confusion Matrix:', confusion_matrix(yvalidate, predictions))
print ('Classification Report:', classification_report(yvalidate, predictions))
# Load the pickled test set, predict on its self.predX data, store the
# predictions under self.predY, and write the frame to CSV.
testdf = pickle.load(open(self.dataLocation+"test"+self.fname+".pkl","rb"))
predictions=gridSearchTS.predict(testdf[self.predX])
testdf[self.predY] = predictions
print(testdf.info())
testdf.to_csv(self.resLocation+self.prefix+self.fname+".csv")
but I get the error:
ValueError: bad input shape (326L, 559L)
The entire stack trace is:
gridSearchTS.fit(Xtrain, ytrain)
File "X:Anaconda2\lib\site-packages\sklearn\grid_search.py", line 804, in fit
return self._fit(X, y, ParameterGrid(self.param_grid))
File "X:Anaconda2\lib\site-packages\sklearn\grid_search.py", line 553, in _fit
for parameters in parameter_iterable
File "X:Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 812, in __call__
self.retrieve()
File "X:Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 762, in retrieve
raise exception
sklearn.externals.joblib.my_exceptions.JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
X:\myScript.py in getUniTags(self=<predict.RbcSolver.Predictor object>, multiNomial=True)
75 Xtrain, Xvalidate , ytrain, yvalidate = train_test_split(X, y, train_size=self.splitFactor)
76
77 parameters = {}
78
79 gridSearchTS = GridSearchCV(self.pipClassifier,parameters,n_jobs=3, verbose=1, scoring='accuracy')
---> 80 gridSearchTS.fit(Xtrain, ytrain)
gridSearchTS.fit = <bound method GridSearchCV.fit of GridSearchCV(c...obs', refit=True, scoring='accuracy', verbose=1)>
Xtrain = 123 <some text here>
Name: Content, dtype: object
ytrain = array([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0,..., ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]])
81
82 predictions = gridSearchTS.predict(Xvalidate )
83
84 print ('Accuracy:', accuracy_score(yvalidate, predictions))
...........................................................................
X:Anaconda2\lib\site-packages\sklearn\grid_search.py in fit(self=GridSearchCV(cv=None, error_score='raise',
...jobs', refit=True, scoring='accuracy', verbose=1), X = 123 <some text here>
Name: Content, dtype: object, y=array([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0,..., ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]]))
799 y : array-like, shape = [n_samples] or [n_samples, n_output], optional
800 Target relative to X for classification or regression;
801 None for unsupervised learning.
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
self._fit = <bound method GridSearchCV._fit of GridSearchCV(...obs', refit=True, scoring='accuracy', verbose=1)>
X = 161 <some text here>
Name: Content, dtype: object
y = array([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0,..., ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]])
self.param_grid = {}
805
806
807 class RandomizedSearchCV(BaseSearchCV):
808 """Randomized search on hyper parameters.
...........................................................................
X:Anaconda2\lib\site-packages\sklearn\grid_search.py in _fit(self=GridSearchCV(cv=None, error_score='raise',
...jobs', refit=True, scoring='accuracy', verbose=1), X = 123 <some text here>
Name: Content, dtype: object, y=array([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0,..., ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]]), parameter_iterable=<sklearn.grid_search.ParameterGrid object>)
548 )(
549 delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
550 train, test, self.verbose, parameters,
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
parameters = undefined
parameter_iterable = <sklearn.grid_search.ParameterGrid object>
554 for train, test in cv)
555
556 # Out is a list of triplet: score, estimator, n_test_samples
557 n_fits = len(out)
...........................................................................
X:Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=Parallel(n_jobs=3), iterable=<generator object <genexpr>>)
807 if pre_dispatch == "all" or n_jobs == 1:
808 # The iterable was consumed all at once by the above for loop.
809 # No need to wait for async callbacks to trigger to
810 # consumption.
811 self._iterating = False
--> 812 self.retrieve()
self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=3)>
813 # Make sure that we get a last message telling us we are done
814 elapsed_time = time.time() - self._start_time
815 self._print('Done %3i out of %3i | elapsed: %s finished',
816 (len(self._output), len(self._output),
---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError
PID: 5360Python 2.7.11: X:Anaconda2\python.exe
...........................................................................
X:Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
67 def __init__(self, iterator_slice):
68 self.items = list(iterator_slice)
69 self._size = len(self.items)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
75 return self._size
76
...........................................................................
X:Anaconda2\lib\site-packages\sklearn\cross_validation.pyc in _fit_and_score(estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyze... tol=0.0001, verbose=0, warm_start=False))]), X = 123 <some text here>
Name: Content, dtype: object, y=memmap([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0..., ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]]), scorer=make_scorer(accuracy_score), train=array([163, 164, 165, 166, 167, 168, 169, 170, 1...79, 480, 481, 482, 483, 484, 485, 486, 487, 488]), test=array([ 0, 1, 2, 3, 4, 5, 6, 7, ..., 155,
156, 157, 158, 159, 160, 161, 162]), verbose=1, parameters={}, fit_params={}, return_train_score=False, return_parameters=True, error_score='raise')
1526
1527 try:
1528 if y_train is None:
1529 estimator.fit(X_train, **fit_params)
1530 else:
-> 1531 estimator.fit(X_train, y_train, **fit_params)
1532
1533 except Exception as e:
1534 if error_score == 'raise':
1535 raise
...........................................................................
X:Anaconda2\lib\site-packages\sklearn\pipeline.pyc in fit(self=Pipeline(steps=[('vect', TfidfVectorizer(analyze... tol=0.0001, verbose=0, warm_start=False))]), X=29 research weeks feb rel sep hvlo diff clos...rd loihi diff aoo...
Name: Content, dtype: object, y=memmap([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0..., ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]]), **fit_params={})
160 y : iterable, default=None
161 Training targets. Must fulfill label requirements for all steps of
162 the pipeline.
163 """
164 Xt, fit_params = self._pre_transform(X, y, **fit_params)
--> 165 self.steps[-1][-1].fit(Xt, y, **fit_params)
166 return self
167
168 def fit_transform(self, X, y=None, **fit_params):
169 """Fit all the transforms one after the other and transform the
...........................................................................
X:Anaconda2\lib\site-packages\sklearn\linear_model\logistic.pyc in fit(self=LogisticRegression(C=10, class_weight=None, dual... tol=0.0001, verbose=0, warm_start=False), X=<326x17576 sparse matrix of type '<type 'numpy.f... stored elements in Compressed Sparse Row format>, y=memmap([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0..., ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]]), sample_weight=None)
1137 if not isinstance(self.tol, numbers.Number) or self.tol < 0:
1138 raise ValueError("Tolerance for stopping criteria must be "
1139 "positive; got (tol=%r)" % self.tol)
1140
1141 X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64,
-> 1142 order="C")
1143 check_classification_targets(y)
1144 self.classes_ = np.unique(y)
1145 n_samples, n_features = X.shape
1146
...........................................................................
X:Anaconda2\lib\site-packages\sklearn\utils\validation.pyc in check_X_y(X=<326x17576 sparse matrix of type '<type 'numpy.f... stored elements in Compressed Sparse Row format>, y=memmap([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0..., ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]]), accept_sparse='csr', dtype=<type 'numpy.float64'>, order='C', copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, warn_on_dtype=False, estimator=None)
510 ensure_min_features, warn_on_dtype, estimator)
511 if multi_output:
512 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
513 dtype=None)
514 else:
--> 515 y = column_or_1d(y, warn=True)
516 _assert_all_finite(y)
517 if y_numeric and y.dtype.kind == 'O':
518 y = y.astype(np.float64)
519
...........................................................................
X:Anaconda2\lib\site-packages\sklearn\utils\validation.pyc in column_or_1d(y=memmap([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0..., ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]]), warn=True)
546 " expected. Please change the shape of y to "
547 "(n_samples, ), for example using ravel().",
548 DataConversionWarning, stacklevel=2)
549 return np.ravel(y)
550
--> 551 raise ValueError("bad input shape {0}".format(shape))
552
553
554 def check_random_state(seed):
555 """Turn seed into a np.random.RandomState instance
ValueError: bad input shape (326L, 559L)
___________________________________________________________________________
How should I transform/format my X dimension?
From the docs for LogisticRegression.fit:
y : array-like, shape (n_samples,)
So y has to be a 1-D array, but the output of your MultiLabelBinarizer will be a 0-1 matrix with 2 dimensions. It looks like it's (326, 559), which would be 326 rows and 559 distinct classes. The format of y is explained in the multilabel docs. You will have to put the LogisticRegression model into a multilabel classifier, like one-vs-rest, which is explained just below on that page. There is also a multilabel example.