-1
>>> import nltk
>>> from nltk.corpus import PlaintextCorpusReader
>>> meucorpus='C:\Users\dudu\Desktop\Artigos sem acentos'
>>> meustextos=PlaintextCorpusReader(meucorpus,'.*')
>>> meustextos.words()
Traceback (most recent call last):
File "<pyshell#4>", line 1, in <module>
meustextos.words()
File "C:\Python27\lib\site-packages\nltk\compat.py", line 498, in wrapper
return method(self).encode('ascii', 'backslashreplace')
File "C:\Python27\lib\site-packages\nltk\util.py", line 664, in __repr__
for elt in self:
File "C:\Python27\lib\site-packages\nltk\corpus\reader\util.py", line 394, in iterate_from
for tok in piece.iterate_from(max(0, start_tok-offset)):
File "C:\Python27\lib\site-packages\nltk\corpus\reader\util.py", line 291, in iterate_from
tokens = self.read_block(self._stream)
File "C:\Python27\lib\site-packages\nltk\corpus\reader\plaintext.py", line 117, in _read_word_block
words.extend(self._word_tokenizer.tokenize(stream.readline()))
File "C:\Python27\lib\site-packages\nltk\data.py", line 1102, in readline
new_chars = self._read(readsize)
File "C:\Python27\lib\site-packages\nltk\data.py", line 1329, in _read
chars, bytes_decoded = self._incr_decode(bytes)
File "C:\Python27\lib\site-packages\nltk\data.py", line 1359, in _incr_decode
return self.decode(bytes, 'strict')
File "C:\Python27\lib\encodings\utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xff in position 0: invalid start byte
First, note that a path written like this
>>> meucorpus='C:\Users\dudu\Desktop\Artigos sem acentos'
only worked by chance: the backslash (\) is used in Python, C and other languages as an escape character, giving a special meaning to some of the characters that follow it. Use the forward slash (/) to separate folders, or a doubled backslash (\\), which is always interpreted as a single backslash.
– jsbueno