Python Multinomial Logistic Regression : ValueError: bad input shape (326L, 559L)

AbtPst picture AbtPst · Dec 14, 2015 · Viewed 9.1k times · Source

I am trying to create a LogisticRegression classifier for a multilabel classification problem:

    traindf = pickle.load(open(self.dataLocation+"train"+self.fname+".pkl","rb"))

    X, y = traindf[self.predX], traindf[self.predY]
    from sklearn.preprocessing import MultiLabelBinarizer  
    y=MultiLabelBinarizer().fit_transform(y) 

    Xtrain, Xvalidate , ytrain, yvalidate = train_test_split(X, y, train_size=self.splitFactor)



    pip = Pipeline([
('vect', TfidfVectorizer(
                        analyzer='word',
                        binary=False,
                        decode_error='ignore',
                        dtype=<type 'numpy.int64'>,
                        encoding=u'utf-8',
                        input=u'content',
                        lowercase=True,
                        max_df=0.25,
                        max_features=None,
                        min_df=1,
                        ngram_range=(1, 1),
                        norm=u'l2',
                        preprocessor=None,
                        smooth_idf=True,
                        stop_words='english',
                        strip_accents=None,
                        sublinear_tf=True,
                        token_pattern=u'(?u)\\b\\w\\w+\\b',
                        tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'),
                        use_idf=True, vocabulary=None)),
('clf', LogisticRegression(
                        C=10,
                        class_weight=None,
                        dual=False,
                        fit_intercept=True,
                        intercept_scaling=1,
                        max_iter=100,
                        multi_class='multinomial',
                        n_jobs=1,
                        penalty='l2', 
                        random_state=None, 
                        solver='lbfgs',
                        tol=0.0001,
                        verbose=0, 
                        warm_start=False))
                ])

parameters = {}

   gridSearchTS = GridSearchCV(pip,parameters,n_jobs=3, verbose=1, scoring='accuracy')
    gridSearchTS.fit(Xtrain, ytrain)

    predictions = gridSearchTS.predict(Xvalidate )

    print ('Accuracy:', accuracy_score(yvalidate, predictions))
    print ('Confusion Matrix:', confusion_matrix(yvalidate, predictions))
    print ('Classification Report:', classification_report(yvalidate, predictions))

    testdf = pickle.load(open(self.dataLocation+"test"+self.fname+".pkl","rb"))

    predictions=gridSearchTS.predict(testdf[self.predX])

    testdf[self.predY] = predictions

    print(testdf.info())

    testdf.to_csv(self.resLocation+self.prefix+self.fname+".csv")

But I get the error:

ValueError: bad input shape (326L, 559L)

The entire stack trace is:

gridSearchTS.fit(Xtrain, ytrain)
  File "X:Anaconda2\lib\site-packages\sklearn\grid_search.py", line 804, in fit
    return self._fit(X, y, ParameterGrid(self.param_grid))
  File "X:Anaconda2\lib\site-packages\sklearn\grid_search.py", line 553, in _fit
    for parameters in parameter_iterable
  File "X:Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 812, in __call__
    self.retrieve()
  File "X:Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 762, in retrieve
    raise exception
sklearn.externals.joblib.my_exceptions.JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
X:\myScript.py in getUniTags(self=<predict.RbcSolver.Predictor object>, multiNomial=True)
     75         Xtrain, Xvalidate , ytrain, yvalidate = train_test_split(X, y, train_size=self.splitFactor)
     76             
     77         parameters = {}
     78     
     79         gridSearchTS = GridSearchCV(self.pipClassifier,parameters,n_jobs=3, verbose=1, scoring='accuracy')
---> 80         gridSearchTS.fit(Xtrain, ytrain)
        gridSearchTS.fit = <bound method GridSearchCV.fit of GridSearchCV(c...obs', refit=True, scoring='accuracy', verbose=1)>
        Xtrain = 123     <some text here>
Name: Content, dtype: object
        ytrain = array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0,..., ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])
     81         
     82         predictions = gridSearchTS.predict(Xvalidate )
     83     
     84         print ('Accuracy:', accuracy_score(yvalidate, predictions))

...........................................................................
X:Anaconda2\lib\site-packages\sklearn\grid_search.py in fit(self=GridSearchCV(cv=None, error_score='raise',
     ...jobs', refit=True, scoring='accuracy', verbose=1), X = 123    <some text here>
Name: Content, dtype: object, y=array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0,..., ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]))
    799         y : array-like, shape = [n_samples] or [n_samples, n_output], optional
    800             Target relative to X for classification or regression;
    801             None for unsupervised learning.
    802 
    803         """
--> 804         return self._fit(X, y, ParameterGrid(self.param_grid))
        self._fit = <bound method GridSearchCV._fit of GridSearchCV(...obs', refit=True, scoring='accuracy', verbose=1)>
        X = 161    <some text here>
Name: Content, dtype: object
        y = array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0,..., ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])
        self.param_grid = {}
    805 
    806 
    807 class RandomizedSearchCV(BaseSearchCV):
    808     """Randomized search on hyper parameters.

...........................................................................
X:Anaconda2\lib\site-packages\sklearn\grid_search.py in _fit(self=GridSearchCV(cv=None, error_score='raise',
     ...jobs', refit=True, scoring='accuracy', verbose=1), X = 123    <some text here>
Name: Content, dtype: object, y=array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0,..., ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), parameter_iterable=<sklearn.grid_search.ParameterGrid object>)
    548         )(
    549             delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
    550                                     train, test, self.verbose, parameters,
    551                                     self.fit_params, return_parameters=True,
    552                                     error_score=self.error_score)
--> 553                 for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.grid_search.ParameterGrid object>
    554                 for train, test in cv)
    555 
    556         # Out is a list of triplet: score, estimator, n_test_samples
    557         n_fits = len(out)

...........................................................................
X:Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=Parallel(n_jobs=3), iterable=<generator object <genexpr>>)
    807             if pre_dispatch == "all" or n_jobs == 1:
    808                 # The iterable was consumed all at once by the above for loop.
    809                 # No need to wait for async callbacks to trigger to
    810                 # consumption.
    811                 self._iterating = False
--> 812             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=3)>
    813             # Make sure that we get a last message telling us we are done
    814             elapsed_time = time.time() - self._start_time
    815             self._print('Done %3i out of %3i | elapsed: %s finished',
    816                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         
PID: 5360Python 2.7.11: X:Anaconda2\python.exe
...........................................................................
X:Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
     67     def __init__(self, iterator_slice):
     68         self.items = list(iterator_slice)
     69         self._size = len(self.items)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
     73 
     74     def __len__(self):
     75         return self._size
     76 

...........................................................................
X:Anaconda2\lib\site-packages\sklearn\cross_validation.pyc in _fit_and_score(estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyze...      tol=0.0001, verbose=0, warm_start=False))]), X = 123    <some text here>
Name: Content, dtype: object, y=memmap([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0..., ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), scorer=make_scorer(accuracy_score), train=array([163, 164, 165, 166, 167, 168, 169, 170, 1...79, 480, 481, 482, 483, 484, 485, 486, 487, 488]), test=array([  0,   1,   2,   3,   4,   5,   6,   7,  ..., 155,
       156, 157, 158, 159, 160, 161, 162]), verbose=1, parameters={}, fit_params={}, return_train_score=False, return_parameters=True, error_score='raise')
   1526 
   1527     try:
   1528         if y_train is None:
   1529             estimator.fit(X_train, **fit_params)
   1530         else:
-> 1531             estimator.fit(X_train, y_train, **fit_params)
   1532 
   1533     except Exception as e:
   1534         if error_score == 'raise':
   1535             raise

...........................................................................
X:Anaconda2\lib\site-packages\sklearn\pipeline.pyc in fit(self=Pipeline(steps=[('vect', TfidfVectorizer(analyze...      tol=0.0001, verbose=0, warm_start=False))]), X=29     research weeks feb rel sep hvlo diff clos...rd loihi diff aoo...
Name: Content, dtype: object, y=memmap([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0..., ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), **fit_params={})
    160         y : iterable, default=None
    161             Training targets. Must fulfill label requirements for all steps of
    162             the pipeline.
    163         """
    164         Xt, fit_params = self._pre_transform(X, y, **fit_params)
--> 165         self.steps[-1][-1].fit(Xt, y, **fit_params)
    166         return self
    167 
    168     def fit_transform(self, X, y=None, **fit_params):
    169         """Fit all the transforms one after the other and transform the

...........................................................................
X:Anaconda2\lib\site-packages\sklearn\linear_model\logistic.pyc in fit(self=LogisticRegression(C=10, class_weight=None, dual...         tol=0.0001, verbose=0, warm_start=False), X=<326x17576 sparse matrix of type '<type 'numpy.f... stored elements in Compressed Sparse Row format>, y=memmap([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0..., ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), sample_weight=None)
   1137         if not isinstance(self.tol, numbers.Number) or self.tol < 0:
   1138             raise ValueError("Tolerance for stopping criteria must be "
   1139                              "positive; got (tol=%r)" % self.tol)
   1140 
   1141         X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, 
-> 1142                          order="C")
   1143         check_classification_targets(y)
   1144         self.classes_ = np.unique(y)
   1145         n_samples, n_features = X.shape
   1146 

...........................................................................
X:Anaconda2\lib\site-packages\sklearn\utils\validation.pyc in check_X_y(X=<326x17576 sparse matrix of type '<type 'numpy.f... stored elements in Compressed Sparse Row format>, y=memmap([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0..., ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), accept_sparse='csr', dtype=<type 'numpy.float64'>, order='C', copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, warn_on_dtype=False, estimator=None)
    510                     ensure_min_features, warn_on_dtype, estimator)
    511     if multi_output:
    512         y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
    513                         dtype=None)
    514     else:
--> 515         y = column_or_1d(y, warn=True)
    516         _assert_all_finite(y)
    517     if y_numeric and y.dtype.kind == 'O':
    518         y = y.astype(np.float64)
    519 

...........................................................................
X:Anaconda2\lib\site-packages\sklearn\utils\validation.pyc in column_or_1d(y=memmap([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0..., ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), warn=True)
    546                           " expected. Please change the shape of y to "
    547                           "(n_samples, ), for example using ravel().",
    548                           DataConversionWarning, stacklevel=2)
    549         return np.ravel(y)
    550 
--> 551     raise ValueError("bad input shape {0}".format(shape))
    552 
    553 
    554 def check_random_state(seed):
    555     """Turn seed into a np.random.RandomState instance

ValueError: bad input shape (326L, 559L)
___________________________________________________________________________

How should I transform/format my X dimension?

Answer

Dthal picture Dthal · Dec 19, 2015

From the docs for LogisticRegression.fit:

y : array-like, shape (n_samples,)

So y has to be a 1-D array, but the output of your MultiLabelBinarizer will be a 0-1 matrix with 2 dimensions. It looks like it's (326, 559), which would be 326 rows and 559 distinct classes. The format of y is explained in the Multilabel docs. You will have to put the LogisticRegression model into a multilabel classifier, like one-vs-rest (for example, `OneVsRestClassifier(LogisticRegression())` from `sklearn.multiclass`), which is explained just below on that page. There is also a multilabel example.