Привет всем. По сабжу: есть сеть, написанная на keras, её задача - распознать сентимент текста, а именно - хороший комментарий к фильму, или плохой(использовал дата сет imdb). Точность на выходе очевидно не 100%, значит есть тексты, которые классифицируются неверно. Офк я попробовал вывести те самые тексты, и даже есть кое-какие результаты, но тексты постоянно одни и те же, что и настораживает меня. Скорее всего я где-то ошибся, но где?
Вот сам скрипt
pos_train_data = pd.read_csv('train_pos.tsv',sep = '\t')
neg_train_data = pd.read_csv('train_neg.tsv',sep = '\t')
pos_test_data = pd.read_csv('test_pos.tsv',sep = '\t')
neg_test_data = pd.read_csv('test_neg.tsv',sep = '\t')
pos_train_data = pos_train_data[['Text','Sentiment']]
neg_train_data = neg_train_data[['Text','Sentiment']]
pos_test_data = pos_test_data[['Text','Sentiment']]
neg_test_data = neg_test_data[['Text','Sentiment']]
data_train = pd.concat([pos_train_data,neg_train_data],ignore_index = True)
data_train = data_train.sample(frac=1).reset_index(drop=True)
#print(data_train.head())
data_test = pd.concat([pos_test_data,neg_test_data],ignore_index = True)
data_test = data_test.sample(frac=1).reset_index(drop=True)
#print(data_test.head())
stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', punctuation)
def textclean(text):
#tokens = word_tokenize(text)
tokens = (text.lower()).split()
tokens = [word for word in tokens if word.isalpha()]
tokens = [w.translate(table) for w in tokens]
tokens = [word for word in tokens if not word in stop_words]
tokens = [word for word in tokens if len(word) > 1]
return tokens
def review_to_words(text):
clean_text = BeautifulSoup(text, "html5lib").get_text()
clean_text = re.sub(r"[^a-zA-Z]", " ", clean_text)
words = (clean_text.lower()).split()
words = [w for w in words if w not in stopwords.words("english")]
return words
reviews = []
for index,row in data_train.iterrows():
text = (row['Text'].lower())
reviews.append(textclean(text))
print(reviews[0])
linked_reviews = list(itertools.chain.from_iterable(reviews))
vocab_freq = dict()
#print(linked_reviews[1])
for word in linked_reviews:
if word not in vocab_freq:
vocab_freq[word] = 1
else:
vocab_freq[word] += 1
sorted_vocab_freq = list(reversed(sorted(vocab_freq.items(), key=operator.itemgetter(1))))
print(sorted_vocab_freq)
print(len(sorted_vocab_freq))
TOTAL_VOCAB = 5000
word_to_id = dict()
id_to_word = dict()
for i in range(TOTAL_VOCAB):
word_to_id[sorted_vocab_freq[i][0]] = i
id_to_word[i] = sorted_vocab_freq[i][0]
print(id_to_word[0])
#review_lengths
review_lengths = pd.DataFrame([len(review) for review in reviews])
review_lengths.columns = ['Len']
print(review_lengths)
#stats
print(review_lengths.describe())
def convert(l):
new_l = []
for word in l:
if word in word_to_id:
new_l.append(word_to_id[word])
return new_l
#print(len(data_train['Sentiment']))
X_train = []
y_train = []
#Tukey's method
first_q = review_lengths.Len.quantile([0.25])[0.25]
third_q = review_lengths.Len.quantile([0.75])[0.75]
upper_threshold = third_q + 1.5*(third_q-first_q)
lower_threshold = first_q - 1.5*(third_q-first_q)
print(upper_threshold,lower_threshold)
for i in range(len(data_train)):
converted_review = convert(reviews[i])
if len(converted_review) <= upper_threshold:
X_train.append(converted_review)
y_train.append(data_train['Sentiment'][i])
X_train = np.array(X_train)
y_train = np.array(y_train)
print(X_train)
print(y_train)
X_train = sequence.pad_sequences(X_train, maxlen=int(upper_threshold),value = 0)
print(X_train.shape,y_train.shape)
data_test = pd.concat([pos_test_data,pos_test_data, neg_test_data], ignore_index=True)
data_test = data_test.sample(frac=0.3).reset_index(drop=True)
print(data_test)
print(pos_test_data)
validation_reviews = []
for index, row in data_test.iterrows():
text = (row['Text'].lower())
validation_reviews.append(textclean(text))
X_val = []
y_val = []
for i in range(len(data_test)):
converted_review = convert(validation_reviews[i])
if len(converted_review) <= upper_threshold:
X_val.append(converted_review)
y_val.append(data_test['Sentiment'][i])
X_val = np.array(X_val)
X_val = sequence.pad_sequences(X_val, maxlen=int(upper_threshold), value=0)
print(X_val)
y_val = np.array(y_val)
#print(X_train)
#print(y_train)
EMBEDDING_LEN = 32
model = Sequential()
model.add(Embedding(TOTAL_VOCAB,EMBEDDING_LEN,input_length = int(upper_threshold)))
model.add(Conv1D(128,3,padding = 'same'))
model.add(Conv1D(64,3,padding = 'same'))
model.add(Conv1D(32,2,padding = 'same'))
model.add(Conv1D(16,2,padding = 'same'))
model.add(Flatten())
model.add(Dropout(0.25))
model.add(Dense(100,activation = 'relu'))
model.add(Dropout(0.25))
model.add(Dense(1,activation='sigmoid'))
model.summary()
opt = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss = 'binary_crossentropy',optimizer = opt ,metrics = ['accuracy'])
model.fit(X_train,y_train,validation_data = (X_val,y_val),epochs = 1 ,batch_size = 1000)
y_pred_vect = model.predict(X_val)
# bolean mask
mask = (y_pred_vect != y_val).any(axis=1)
print(mask)
print(len(mask))
num_words=1000 # only use top 1000 words
INDEX_FROM=3 # word index offset
# этот шаг нужен чтобы получить `test_x` в изначальном виде (до токенизации):
(train_x, _), (test_x, _) = imdb.load_data(num_words=num_words, index_from=index_from)
x_wrong = test_x[mask]
word_to_id = imdb.get_word_index()
word_to_id = {k:(v+index_from) for k,v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2
id_to_word = {value:key for key,value in word_to_id.items()}
all_wrong_sents = [' '.join(id_to_word[id] for id in sent) for sent in x_wrong]
Cобственно это часть кода, отвечающая за вывод текстов.
y_pred_vect = model.predict(X_val)
# bolean mask
mask = (y_pred_vect != y_val).any(axis=1)
print(mask)
print(len(mask))
num_words=1000 # only use top 1000 words
INDEX_FROM=3 # word index offset
# этот шаг нужен чтобы получить `test_x` в изначальном виде (до токенизации):
(train_x, _), (test_x, _) = imdb.load_data(num_words=num_words, index_from=index_from)
x_wrong = test_x[mask]
word_to_id = imdb.get_word_index()
word_to_id = {k:(v+index_from) for k,v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2
id_to_word = {value:key for key,value in word_to_id.items()}
all_wrong_sents = [' '.join(id_to_word[id] for id in sent) for sent in x_wrong]