быстрый способ, если столбцов очень много
import sys
import os
import zipfile
import shutil
import random
import logging
import gdown
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image, ImageOps
from joblib import Parallel, delayed
import torch
import torch.nn as nn
from torchvision import models, transforms
from torchvision.models import VGG16_Weights
# Working locations inside the current directory.
DATASET_DIR = os.path.join(os.getcwd(), "dataset")
TEMP_DIR = os.path.join(os.getcwd(), "temp")

# Where the dataset archive is kept: Google Drive when the notebook runs in
# Colab (google.colab is importable), the current directory otherwise.
try:
    from google.colab import drive

    drive.mount("/content/drive")
    DRIVE_DIR = os.path.join("/content/drive", "MyDrive")
except ImportError:
    DRIVE_DIR = os.getcwd()

ZIP_PATH = os.path.join(DRIVE_DIR, "dataset_32_classes.zip")
os.makedirs(DATASET_DIR, exist_ok=True)

# Add the parent of the working directory to the import path
# (presumably where the Tools module lives - confirm against the repo layout).
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
from Tools import find_image_files

# Fixed seed for the stdlib and NumPy random generators.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Show floats in pandas output with four decimal places.
pd.options.display.float_format = "{:.4f}".format
/home/milia/miniforge3/envs/rapids-24.10/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
# Google Drive id of the dataset archive.
file_id = "1FKZ9oHZ3zFMoFJX2f2aI34M2XZ2ikSb0"
if os.path.exists(ZIP_PATH):
    print("Архив уже добавлен")
else:
    # Download straight to ZIP_PATH: that is the path checked above and opened
    # below. Saving into the working directory instead breaks the run whenever
    # DRIVE_DIR differs from the working directory (the Colab branch).
    gdown.download(
        f"https://drive.google.com/uc?id={file_id}",
        ZIP_PATH,
        quiet=False,
    )
# Unpack the archive into the same folder that is listed right after.
with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref:
    zip_ref.extractall(DATASET_DIR)
classes = os.listdir(DATASET_DIR)
# Check the folder structure: one folder per class is expected.
assert len(classes) == 32, f"expected 32 class folders, found {len(classes)}"
print(f"Количество папок: {len(classes)}")
# Report whether CUDA is usable and, if so, how many devices there are and
# which one is current.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.device_count())
    print(torch.cuda.current_device())
# Single source of truth for placement: GPU when available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vgg16 = models.vgg16(weights=VGG16_Weights.DEFAULT)
vgg16 = vgg16.to(device)
# Drop the last child module of VGG16; what remains is the convolutional
# stack followed by the adaptive average pooling.
feature_extractor = nn.Sequential(*list(vgg16.children())[:-1])
# Use the computed device instead of an unconditional .cuda(), which raises
# on machines without a GPU.
feature_extractor.to(device)
feature_extractor.eval()  # switch to evaluation mode
True 1 0
Sequential(
(0): Sequential(
(0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): ReLU(inplace=True)
(2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(3): ReLU(inplace=True)
(4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(6): ReLU(inplace=True)
(7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(8): ReLU(inplace=True)
(9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(11): ReLU(inplace=True)
(12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(13): ReLU(inplace=True)
(14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(15): ReLU(inplace=True)
(16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(18): ReLU(inplace=True)
(19): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(20): ReLU(inplace=True)
(21): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(22): ReLU(inplace=True)
(23): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(24): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(25): ReLU(inplace=True)
(26): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(27): ReLU(inplace=True)
(28): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(29): ReLU(inplace=True)
(30): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)
(1): AdaptiveAvgPool2d(output_size=(7, 7))
)
Предварительная обработка для VGG16 (или ResNet50)
# Normalisation with statistics computed over the ImageNet dataset.
imagenet_normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
# Pipeline applied to every image before it is passed to the network.
preprocess = transforms.Compose(
    [
        # transforms.Resize((224, 224)),  # alternative image size
        transforms.Resize((128, 128)),  # resize the image
        transforms.ToTensor(),  # convert the image to a tensor
        imagenet_normalize,
    ]
)
параметры Normalize были взяты из документации pytorch
Функция предварительной обработки одного изображения
def preprocess_image(image_path):
    """Load one image and turn it into a single-image batch.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The result of the module-level ``preprocess`` pipeline applied to the
        image converted to RGB, with a leading dimension of size 1 added.

    Raises:
        Exception: Whatever opening or converting the image raised. The
            failure is logged with the offending path and then re-raised.
    """
    try:
        # convert() returns a new image, so the file handle can be closed as
        # soon as the conversion is done.
        with Image.open(image_path) as source:
            image = source.convert("RGB")
    except Exception as e:
        # Log the path that was actually passed in (the old handler referred
        # to an undefined name) and re-raise: there is no image to continue with.
        logging.error("error %s for %s", e, image_path)
        raise
    return preprocess(image).unsqueeze(0)
Получение по N изображений каждого класса.
def get_number_images(number, directory):
    """Collect up to ``number`` images from every class folder.

    Args:
        number: Maximum number of images to take from each class.
        directory: Folder whose entries are treated as class folders; the
            entry name is used as the class label.

    Returns:
        A pair ``(images, labels)`` of equally long lists, where
        ``labels[i]`` is the class folder name that ``images[i]`` came from.
    """
    return_images = []
    return_class = []
    for image_class in os.listdir(directory):
        # find_image_files is assumed to return a sliceable sequence of image
        # paths for the given folder - confirm against Tools.
        selected = find_image_files(os.path.join(directory, image_class))[:number]
        return_images += selected
        # One label per image actually taken, so that images and labels stay
        # aligned when a class holds fewer than ``number`` images.
        return_class += [image_class] * len(selected)
    return return_images, return_class
Функция извлечения признаков из изображений.
Для ускорения процесса используем вычисления на видеокарте, что сокращает время обработки с 1 часа до 4 минут при размере изображения 224 px.
def extract_features_from_dataset(image_paths: str | list[str], feature_extractor=feature_extractor):
    """Run images through the feature extractor and stack the flattened outputs.

    Args:
        image_paths: A single image path or a list of image paths.
        feature_extractor: Callable applied to each preprocessed image batch;
            defaults to the module-level extractor.

    Returns:
        A tensor with the per-image outputs flattened to one row each and
        concatenated along dimension 0 (one row per path, assuming the
        extractor keeps the batch dimension), or ``None`` when
        ``image_paths`` is empty.
    """
    if isinstance(image_paths, str):
        image_paths = [image_paths]
    # Run on the device the extractor's weights live on; fall back to
    # "GPU when available" for callables that expose no parameters.
    try:
        device = next(feature_extractor.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    collected = []
    with torch.no_grad():
        for path in image_paths:
            input_tensor = preprocess_image(path).to(device)
            features = feature_extractor(input_tensor)
            collected.append(features.reshape(features.size(0), -1))
    if not collected:
        return None
    # Concatenate once at the end instead of re-copying the growing result on
    # every iteration.
    return torch.cat(collected, dim=0)
Похоже, из-за доступного кеша, пока тензор не разросся, обработка на видеокарте действительно получает ускорение.
# def multi_extract_features_from_dataset(ndarray, names):
# ans = []
# t = 0
# for i in range(ndarray, 0 , 400):
# print(i)
# ans += map(extract_features_from_dataset, ndarray[t:i+1])
# t+=i
# return ans, names
def multi_extract_features_from_dataset(ndarray, names):
    """Extract features for every element of ``ndarray`` and pass ``names`` through.

    Args:
        ndarray: Iterable whose elements are each handed to
            ``extract_features_from_dataset`` separately.
        names: Returned unchanged next to the features.

    Returns:
        A pair ``(features, names)`` where ``features`` is a list holding one
        extraction result per element of ``ndarray``.
    """
    # Author's observation: probably thanks to the available cache, there is
    # some speed-up as long as the tensor has not grown large.
    per_item_features = [extract_features_from_dataset(item) for item in ndarray]
    return per_item_features, names
# Number of images requested from every class.
image_number = 600
image_paths, labels = get_number_images(image_number, DATASET_DIR)
# image_paths = [item for item in zip(image_paths, labels)]
# dataset_features = extract_features_from_dataset(image_paths, feature_extractor)
# print("Dataset features shape:", dataset_features.shape)
N_CORES = 12  # number of CPU cores to use
# Cut paths and labels into N_CORES chunks, one pair of chunks per worker task.
list_array = np.array_split(image_paths, N_CORES)
labels_array = np.array_split(labels, N_CORES)
parallel_runner = Parallel(n_jobs=N_CORES, verbose=11)
data = parallel_runner(
    delayed(multi_extract_features_from_dataset)(paths_chunk, labels_chunk)
    for paths_chunk, labels_chunk in zip(list_array, labels_array)
)
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers. [Parallel(n_jobs=12)]: Done 1 tasks | elapsed: 39.5s [Parallel(n_jobs=12)]: Done 3 out of 12 | elapsed: 41.5s remaining: 2.1min [Parallel(n_jobs=12)]: Done 5 out of 12 | elapsed: 42.3s remaining: 59.2s [Parallel(n_jobs=12)]: Done 7 out of 12 | elapsed: 43.1s remaining: 30.8s [Parallel(n_jobs=12)]: Done 9 out of 12 | elapsed: 43.8s remaining: 14.6s [Parallel(n_jobs=12)]: Done 12 out of 12 | elapsed: 44.8s finished
import gc
# Force a garbage-collection pass; as the last expression of the cell, the
# value returned by collect() is echoed as the cell output.
gc.collect()
0
собираем тензоры в удобный вид
расчет на видеокарте с помощью RAPIDS
%%time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
import cuml
from cuml.svm import LinearSVC

# Turn the class names into integer codes.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(names_df[0])

# Split into training and test parts, 40% held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    encoded_labels,
    test_size=0.4,
    random_state=42,
)

# NOTE(review): this estimator is created and immediately discarded - confirm
# whether the call is needed at all.
cuml.DBSCAN(max_mbytes_per_batch=3000)

# Train an SVM
# svm = SVC(kernel='rbf', probability=True, C=1.0, cache_size = 3000)
svm = LinearSVC(
    loss="squared_hinge",
    penalty="l1",
    probability=True,
    C=1.0,
)
svm.fit(X_train.to_numpy(), y_train)

# Score the SVM on the test split first, then on the training split.
accuracy = svm.score(X_test, y_test)
print("SVM test accuracy:", accuracy)
accuracy = svm.score(X_train, y_train)
print("SVM train accuracy:", accuracy)

# Predictions on the training data, then accuracy and confusion matrix from them.
y_pred_train = svm.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
cm_train = confusion_matrix(y_train, y_pred_train)
print(f"Точность на тренировочных данных: {train_accuracy:.4f}")
print("Матрица неточностей на тренировочных данных:")
print(cm_train)
[W] [23:03:17.708637] QWL-QN: max iterations reached [W] [23:03:17.708835] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. [W] [23:03:30.706130] QWL-QN: max iterations reached [W] [23:03:30.706305] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. [W] [23:03:44.134167] QWL-QN: max iterations reached [W] [23:03:44.134334] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. [W] [23:04:27.473269] QWL-QN: max iterations reached [W] [23:04:27.473447] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. [W] [23:04:56.819098] QWL-QN: max iterations reached [W] [23:04:56.819264] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. [W] [23:05:24.816835] QWL-QN: max iterations reached [W] [23:05:24.817002] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. [W] [23:05:38.143960] QWL-QN: max iterations reached [W] [23:05:38.144118] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. [W] [23:05:52.290765] QWL-QN: max iterations reached [W] [23:05:52.290937] Maximum iterations reached before solver is converged. 
To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. [W] [23:06:05.796730] QWL-QN: max iterations reached [W] [23:06:05.796920] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. [W] [23:06:17.617096] QWL-QN: max iterations reached [W] [23:06:17.617269] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. [W] [23:06:44.139395] QWL-QN: max iterations reached [W] [23:06:44.139573] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. [W] [23:06:57.366305] QWL-QN: max iterations reached [W] [23:06:57.366470] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. [W] [23:07:10.267341] QWL-QN: max iterations reached [W] [23:07:10.267520] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. [W] [23:07:35.924690] QWL-QN: max iterations reached [W] [23:07:35.924862] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. [W] [23:08:21.357464] QWL-QN: max iterations reached [W] [23:08:21.357630] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. 
[W] [23:08:34.489716] QWL-QN: max iterations reached [W] [23:08:34.489890] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. [W] [23:09:11.588510] QWL-QN: max iterations reached [W] [23:09:11.588672] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. [W] [23:09:25.574546] QWL-QN: max iterations reached [W] [23:09:25.574718] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data. SVM test accuracy: 0.9669271111488342 SVM train accuracy: 1.0 Точность на тренировочных данных: 1.0000 Матрица неточностей на тренировочных данных: [[361 0 0 ... 0 0 0] [ 0 362 0 ... 0 0 0] [ 0 0 377 ... 0 0 0] ... [ 0 0 0 ... 354 0 0] [ 0 0 0 ... 0 367 0] [ 0 0 0 ... 0 0 375]] CPU times: user 6min 21s, sys: 1.49 s, total: 6min 23s Wall time: 6min 23s
SVC не удалось использовать, т. к. в исходном коде есть аллокация памяти, но нет её освобождения. Реализации SVC в версиях 24.12 и 25.02a ещё не содержат исправления этой утечки памяти (см. issue по реализации SVM).
# Turn the class names into integer codes.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(names_df[0])

# Split into training and test parts, 40% held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    encoded_labels,
    test_size=0.4,
    random_state=42,
)

# NOTE(review): this estimator is created and immediately discarded - confirm
# whether the call is needed at all.
cuml.DBSCAN(max_mbytes_per_batch=4000)

# Train an SVM
# svm = SVC(kernel='rbf', probability=True, C=1.0, cache_size = 3000)
svm = LinearSVC(
    loss="squared_hinge",
    penalty="l1",
    probability=True,
    C=1.0,
)
svm.fit(X_train.to_numpy(), y_train)

# Score the SVM on the test split first, then on the training split.
accuracy = svm.score(X_test, y_test)
print("SVM test accuracy:", accuracy)
accuracy = svm.score(X_train, y_train)
print("SVM train accuracy:", accuracy)

# Predictions on the training data, then accuracy and confusion matrix from them.
y_pred_train = svm.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
cm_train = confusion_matrix(y_train, y_pred_train)
print(f"Точность на тренировочных данных: {train_accuracy:.4f}")
print("Матрица неточностей на тренировочных данных:")
print(cm_train)
import pickle

# Save the trained classifier to disk. The context manager closes the file
# even when pickling fails; the bare open() call left the handle unclosed.
filename = "LinearSVC_model.pickle"
with open(filename, "wb") as model_file:
    pickle.dump(svm, model_file)
Итоги
| Модель | Гиперпараметры | Размер изображения | Цветное | accuracy на трейне | accuracy на test | Время извлечения признаков датасета | Время обучения модели |
|---|---|---|---|---|---|---|---|
| LinearSVC GPU | C=1, loss='squared_hinge', penalty='l1' | 128px | да | 1.0 | 0.96 | ≈4 мин | ≈6 мин |
| LinearSVC GPU | C=1, loss='squared_hinge', penalty='l1' | 128px | да | 1.0 | 0.6 | ≈4 мин | ≈6 мин |
| SVC CPU | C=1, kernel='linear' | 224px | да | 1.0 | 0.96 | ≈4 мин | ≈1.5 ч |
| SVC GPU | C=1, kernel='linear' | 224px | да | ? | ? | ≈error | ≈error |
Из-за нелинейно растущей ($O(n^2)$) сложности SVM желательно тренировать модель в Google Colab или на видеокарте.