быстрый способ, если столбцов очень много

In [1]:

Copied!





import sys
import os
import zipfile
import shutil
import random
import logging
import gdown
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image, ImageOps
from joblib import Parallel, delayed


import torch
import torch.nn as nn
from torchvision import models, transforms
from torchvision.models import VGG16_Weights

# Locate the directory that holds the dataset archive: Google Drive when the
# notebook runs on Colab, otherwise the current working directory.
try:
    from google.colab import drive

    drive.mount("/content/drive")
    DRIVE_DIR = os.path.join("/content/drive", "MyDrive")
except ImportError:
    # google.colab is not importable, so fall back to the working directory.
    DRIVE_DIR = os.getcwd()


DATASET_DIR = os.path.join(os.getcwd(), "dataset")
TEMP_DIR = os.path.join(os.getcwd(), "temp")
ZIP_PATH = os.path.join(DRIVE_DIR, "dataset_32_classes.zip")
os.makedirs(DATASET_DIR, exist_ok=True)

# Add the parent of the working directory to sys.path so that the
# project-local `Tools` module below can be imported.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from Tools import find_image_files


# Seed the stdlib and NumPy random generators.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
pd.options.display.float_format = "{:.4f}".format

/home/milia/miniforge3/envs/rapids-24.10/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

In [2]:

Copied!





# Fetch the dataset archive (unless it is already present) and unpack it.
file_id = "1FKZ9oHZ3zFMoFJX2f2aI34M2XZ2ikSb0"
if os.path.exists(ZIP_PATH):
    print("Архив уже добавлен")
else:
    # Download straight to ZIP_PATH: that is the path checked above and
    # opened below, and it differs from the working directory on Colab.
    gdown.download(
        f"https://drive.google.com/uc?id={file_id}",
        ZIP_PATH,
        quiet=False,
    )

# Unpack the archive into the dataset directory.
with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref:
    zip_ref.extractall(DATASET_DIR)

classes = os.listdir(DATASET_DIR)

# Sanity-check the folder structure: one entry per class is expected.
assert len(classes) == 32
print(f"Количество папок: {len(classes)}")

# Route PyTorch CUDA allocations through RMM.
# NOTE(review): this snippet was collapsed onto a single line in the export;
# only the line structure was restored. Confirm it is meant to be executed.
from cuml.svm import SVC, SVR, LinearSVC
import torch
from rmm.allocators.torch import rmm_torch_allocator
import rmm

rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource())
torch.cuda.memory.change_current_allocator(rmm_torch_allocator)

In [3]:

Copied!





# Report CUDA availability and pick the compute device. `device` is always
# defined, so the cell also works on CPU-only machines.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.device_count())
    print(torch.cuda.current_device())
device = torch.device("cuda" if cuda_available else "cpu")

# Load VGG16 with its default pretrained weights and move it to the device.
vgg16 = models.vgg16(weights=VGG16_Weights.DEFAULT)
vgg16 = vgg16.to(device)

# Drop the last child module of VGG16 and keep the rest as the extractor.
feature_extractor = nn.Sequential(*list(vgg16.children())[:-1])
feature_extractor.to(device)
feature_extractor.eval()  # switch to evaluation mode

True
1
0

Out[3]:

Sequential(
  (0): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (18): ReLU(inplace=True)
    (19): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (20): ReLU(inplace=True)
    (21): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (22): ReLU(inplace=True)
    (23): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (24): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (25): ReLU(inplace=True)
    (26): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (27): ReLU(inplace=True)
    (28): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (29): ReLU(inplace=True)
    (30): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (1): AdaptiveAvgPool2d(output_size=(7, 7))
)

Предварительная обработка для VGG16 (или ResNet50)

In [4]:

Copied!





# Preprocessing pipeline applied to every image before it is passed to the
# feature extractor: resize, convert to a tensor, normalize per channel.
preprocess = transforms.Compose(
    [
        # transforms.Resize((224, 224)),  # alternative input size
        transforms.Resize((128, 128)),
        transforms.ToTensor(),  # convert the image to a tensor
        transforms.Normalize(
            # Normalization with statistics computed over the ImageNet dataset.
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

Параметры Normalize взяты из документации PyTorch.

Функция предварительной обработки одного изображения

In [5]:

Copied!





def preprocess_image(image_path):
    """Load one image and convert it into a batch of size one.

    The image is opened with PIL, converted to RGB, passed through the
    module-level ``preprocess`` pipeline and given a leading batch dimension
    via ``unsqueeze(0)``.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The preprocessed image tensor with an extra leading dimension.

    Raises:
        Exception: Whatever ``Image.open`` / ``convert`` raised; the error is
            logged together with the offending path and then re-raised.
    """
    try:
        image = Image.open(image_path).convert("RGB")
    except Exception as e:
        # Log the path that actually failed and propagate the error: there is
        # no image to preprocess, so continuing would fail on an unbound name.
        logging.error("error %s for %s", e, image_path)
        raise
    return preprocess(image).unsqueeze(0)

Получение по N изображений каждого класса.

In [6]:

Copied!





def get_number_images(number, directory):
    """Collect up to ``number`` image paths per class sub-directory.

    Every entry of ``directory`` is treated as a class name; the images found
    under it by ``find_image_files`` are truncated to the first ``number``.

    Args:
        number: Maximum number of images to take from each class.
        directory: Directory whose entries are the class folders.

    Returns:
        A ``(images, classes)`` pair of equally long lists, where
        ``classes[i]`` is the class-folder name of ``images[i]``.
    """
    return_images = []
    return_class = []
    for image_class in os.listdir(directory):
        # assumes find_image_files returns a sliceable sequence of paths — TODO confirm
        selected = find_image_files(os.path.join(directory, image_class))[:number]
        return_images += selected
        # Emit one label per image actually taken, so the two lists stay
        # aligned when a class holds fewer than `number` images.
        return_class += [image_class] * len(selected)
    return return_images, return_class

Функция извлечения признаков из изображений.

Для ускорения процесса используем вычисления на видеокарте, что сокращает время обработки с 1 часа до 4 минут при размере изображения 224.

In [7]:

Copied!





def extract_features_from_dataset(image_paths: str | list[str], feature_extractor=feature_extractor):
    """Run images through the feature extractor and flatten the outputs.

    Args:
        image_paths: A single image path or a list of image paths.
        feature_extractor: Callable applied to each preprocessed image batch;
            defaults to the module-level ``feature_extractor``.

    Returns:
        A 2-D tensor with one flattened feature row per image, or ``None``
        when ``image_paths`` is empty.
    """
    if isinstance(image_paths, str):
        image_paths = [image_paths]

    # Run on the device the extractor's parameters live on. If it exposes no
    # parameters, use CUDA when available and the CPU otherwise.
    parameters = getattr(feature_extractor, "parameters", None)
    first_param = next(parameters(), None) if callable(parameters) else None
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    rows = []
    with torch.no_grad():
        for path in image_paths:
            input_tensor = preprocess_image(path).to(device)
            features = feature_extractor(input_tensor)
            # Flatten everything except the batch dimension.
            rows.append(features.view(features.size(0), -1))

    if not rows:
        return None
    # Concatenate once at the end instead of re-allocating on every iteration.
    return torch.cat(rows, dim=0)

Похоже, из-за доступного кеша, пока тензор не разросся, обработка на видеокарте действительно даёт ускорение.

In [8]:

Copied!





# def multi_extract_features_from_dataset(ndarray, names):
#     ans = []
#     t = 0
#     for i in range(ndarray, 0 , 400):
#         print(i)
#         ans += map(extract_features_from_dataset, ndarray[t:i+1])
#         t+=i
#     return ans, names


def multi_extract_features_from_dataset(ndarray, names):
    """Extract features for each path in ``ndarray``, one image per call.

    Args:
        ndarray: Iterable of image paths.
        names: Labels belonging to ``ndarray``; passed through unchanged.

    Returns:
        A ``(features, names)`` pair where ``features`` is a list holding the
        result of ``extract_features_from_dataset`` for every path.
    """
    # Author's observation: processing images one by one appeared to give a
    # speed-up, apparently thanks to caching while the tensor stays small.
    return [extract_features_from_dataset(path) for path in ndarray], names

In [9]:

Copied!





# Take up to `image_number` images per class and extract their features in
# parallel worker processes.
image_number = 600
image_paths, labels = get_number_images(image_number, DATASET_DIR)
# image_paths = [item for item in zip(image_paths, labels)]
# dataset_features = extract_features_from_dataset(image_paths, feature_extractor)
# print("Dataset features shape:", dataset_features.shape)

N_CORES = 12  # number of worker processes (CPU cores) to use

# Split paths and labels into N_CORES aligned chunks, one chunk per job.
list_array = np.array_split(image_paths, N_CORES)
labels_array = np.array_split(labels, N_CORES)
data = Parallel(n_jobs=N_CORES, verbose=11)(
    delayed(multi_extract_features_from_dataset)(array, names) for array, names in zip(list_array, labels_array)
)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:   39.5s
[Parallel(n_jobs=12)]: Done   3 out of  12 | elapsed:   41.5s remaining:  2.1min
[Parallel(n_jobs=12)]: Done   5 out of  12 | elapsed:   42.3s remaining:   59.2s
[Parallel(n_jobs=12)]: Done   7 out of  12 | elapsed:   43.1s remaining:   30.8s
[Parallel(n_jobs=12)]: Done   9 out of  12 | elapsed:   43.8s remaining:   14.6s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:   44.8s finished

In [10]:

Copied!

import gc

# Force a garbage-collection pass.
gc.collect()

Out[10]:

собираем тензоры в удобный вид

# NOTE(review): this snippet was collapsed onto a single line in the export;
# the statement boundaries below were reconstructed — confirm against the
# original notebook.

# --- Assemble the per-job results into data frames ------------------------
# Each job is a (features, names) pair: job[0] is a list of per-image feature
# tensors, job[1] the matching class names.
names_df = pd.concat(
    [pd.DataFrame(data=job[1], index=None, columns=None) for job in data],
    ignore_index=True,
)

# Concatenate every per-image tensor of every job into one 2-D tensor.
data = torch.cat(tuple(new_tensor for jobs in data for new_tensor in jobs[0]), dim=0)
df = pd.DataFrame(data.cpu().numpy())

df.to_csv("./tensors.csv", encoding="utf-8-sig", index=False)
names_df.to_csv("./names_df.csv", encoding="utf-8-sig", index=False)
torch.cuda.empty_cache()

# --- Reload the saved frames from disk ------------------------------------
# presumably lets a later session skip feature extraction — TODO confirm
names_df = pd.read_csv("./names_df.csv", encoding="utf-8-sig")
# The CSV header is read back as the string '0'; restore the integer label.
names_df = names_df.rename(columns={'0': 0})

# df = pd.read_csv("./tensors.csv", encoding="utf-8-sig")
# Faster path when there are very many columns: parse the comma-separated
# numeric body directly, skipping the header row.
df = pd.DataFrame(
    np.loadtxt(
        "./tensors.csv",
        delimiter=",",
        skiprows=1,
        dtype=np.float32,
        encoding="utf-8-sig",
    )
)

расчет на видеокарте с помощью RAPIDS

# Use an RMM memory pool for device allocations and make CuPy allocate
# through RMM.
# NOTE(review): this snippet was collapsed onto a single line in the export;
# confirm it is meant to be executed.
import rmm
import cupy
from rmm.allocators.cupy import rmm_cupy_allocator

pool = rmm.mr.PoolMemoryResource(
    rmm.mr.CudaMemoryResource(),
    initial_pool_size=2**28,
    maximum_pool_size=2**32,
)
rmm.mr.set_current_device_resource(pool)
cupy.cuda.set_allocator(rmm_cupy_allocator)

In [11]:

Copied!





%%time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

import cuml
from cuml.svm import LinearSVC

encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(names_df[0])

# Split into training and test

X_train, X_test, y_train, y_test = train_test_split(df, encoded_labels, test_size=0.4, random_state=42)
cuml.DBSCAN(max_mbytes_per_batch=3000)

# Train an SVM
# svm = SVC(kernel='rbf', probability=True, C=1.0, cache_size = 3000)

svm = LinearSVC(loss="squared_hinge", penalty="l1", probability=True, C=1.0)
svm.fit(X_train.to_numpy(), y_train)

# Test the SVM
accuracy = svm.score(X_test, y_test)
print("SVM test accuracy:", accuracy)

accuracy = svm.score(X_train, y_train)
print("SVM train accuracy:", accuracy)

# Прогнозирование на тренировочных данных
y_pred_train = svm.predict(X_train)

# Оценка точности на тренировочных данных
train_accuracy = accuracy_score(y_train, y_pred_train)
print(f"Точность на тренировочных данных: {train_accuracy:.4f}")

# Матрица неточностей на тренировочных данных
cm_train = confusion_matrix(y_train, y_pred_train)
print("Матрица неточностей на тренировочных данных:")
print(cm_train)
%%time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

import cuml
from cuml.svm import LinearSVC

encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(names_df[0])

# Split into training and test

X_train, X_test, y_train, y_test = train_test_split(df, encoded_labels, test_size=0.4, random_state=42)
cuml.DBSCAN(max_mbytes_per_batch=3000)

# Train an SVM
# svm = SVC(kernel='rbf', probability=True, C=1.0, cache_size = 3000)

svm = LinearSVC(loss="squared_hinge", penalty="l1", probability=True, C=1.0)
svm.fit(X_train.to_numpy(), y_train)

# Test the SVM
accuracy = svm.score(X_test, y_test)
print("SVM test accuracy:", accuracy)

accuracy = svm.score(X_train, y_train)
print("SVM train accuracy:", accuracy)

# Прогнозирование на тренировочных данных
y_pred_train = svm.predict(X_train)

# Оценка точности на тренировочных данных
train_accuracy = accuracy_score(y_train, y_pred_train)
print(f"Точность на тренировочных данных: {train_accuracy:.4f}")

# Матрица неточностей на тренировочных данных
cm_train = confusion_matrix(y_train, y_pred_train)
print("Матрица неточностей на тренировочных данных:")
print(cm_train)

[W] [23:03:17.708637] QWL-QN: max iterations reached
[W] [23:03:17.708835] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:03:30.706130] QWL-QN: max iterations reached
[W] [23:03:30.706305] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:03:44.134167] QWL-QN: max iterations reached
[W] [23:03:44.134334] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:04:27.473269] QWL-QN: max iterations reached
[W] [23:04:27.473447] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:04:56.819098] QWL-QN: max iterations reached
[W] [23:04:56.819264] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:05:24.816835] QWL-QN: max iterations reached
[W] [23:05:24.817002] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:05:38.143960] QWL-QN: max iterations reached
[W] [23:05:38.144118] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:05:52.290765] QWL-QN: max iterations reached
[W] [23:05:52.290937] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:06:05.796730] QWL-QN: max iterations reached
[W] [23:06:05.796920] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:06:17.617096] QWL-QN: max iterations reached
[W] [23:06:17.617269] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:06:44.139395] QWL-QN: max iterations reached
[W] [23:06:44.139573] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:06:57.366305] QWL-QN: max iterations reached
[W] [23:06:57.366470] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:07:10.267341] QWL-QN: max iterations reached
[W] [23:07:10.267520] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:07:35.924690] QWL-QN: max iterations reached
[W] [23:07:35.924862] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:08:21.357464] QWL-QN: max iterations reached
[W] [23:08:21.357630] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:08:34.489716] QWL-QN: max iterations reached
[W] [23:08:34.489890] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:09:11.588510] QWL-QN: max iterations reached
[W] [23:09:11.588672] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:09:25.574546] QWL-QN: max iterations reached
[W] [23:09:25.574718] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
SVM test accuracy: 0.9669271111488342
SVM train accuracy: 1.0
Точность на тренировочных данных: 1.0000
Матрица неточностей на тренировочных данных:
[[361   0   0 ...   0   0   0]
 [  0 362   0 ...   0   0   0]
 [  0   0 377 ...   0   0   0]
 ...
 [  0   0   0 ... 354   0   0]
 [  0   0   0 ...   0 367   0]
 [  0   0   0 ...   0   0 375]]
CPU times: user 6min 21s, sys: 1.49 s, total: 6min 23s
Wall time: 6min 23s

SVC не удалось использовать, т. к. в исходном коде есть аллокация памяти, но нет её освобождения. SVC в версиях 24.12 и 25.02a ещё не имеет исправления утечки памяти в реализации SVM (см. issue).

In [12]:

Copied!





# Re-run of the training cell: encode labels, split, fit and evaluate.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(names_df[0])

# Split into training and test sets (60 % / 40 %).
X_train, X_test, y_train, y_test = train_test_split(
    df, encoded_labels, test_size=0.4, random_state=RANDOM_STATE
)
# Convert once so that fit, score and predict all receive the same array type.
X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()

# Train an SVM
# svm = SVC(kernel='rbf', probability=True, C=1.0, cache_size = 3000)
svm = LinearSVC(loss="squared_hinge", penalty="l1", probability=True, C=1.0)
svm.fit(X_train_np, y_train)

# Test the SVM
accuracy = svm.score(X_test_np, y_test)
print("SVM test accuracy:", accuracy)

accuracy = svm.score(X_train_np, y_train)
print("SVM train accuracy:", accuracy)

# Predict on the training data.
y_pred_train = svm.predict(X_train_np)

# Accuracy on the training data.
train_accuracy = accuracy_score(y_train, y_pred_train)
print(f"Точность на тренировочных данных: {train_accuracy:.4f}")

# Confusion matrix on the training data.
cm_train = confusion_matrix(y_train, y_pred_train)
print("Матрица неточностей на тренировочных данных:")
print(cm_train)

In [13]:

Copied!

import pickle

# Serialize the trained model to disk; the context manager guarantees the
# file is flushed and closed even if pickling fails.
filename = "LinearSVC_model.pickle"
with open(filename, "wb") as model_file:
    pickle.dump(svm, model_file)

Итоги¶

Модель	Гиперпараметры	Размер изображения	Цветное	accuracy на трейне	accuracy на test	Время извлечения признаков датасета	Время обучения модели
LinearSVC GPU	C=1, loss='squared_hinge', penalty='l1'	128px	да	1.0	0.96	≈4 мин	≈6 мин
LinearSVC GPU	C=1, loss='squared_hinge', penalty='l1'	128px	да	1.0	0.6	≈4 мин	≈6 мин
SVC CPU	C=1, kernel='linear'	224px	да	1.0	0.96	≈4 мин	≈1.5 ч
SVC GPU	C=1, kernel='linear'	224px	да	?	?	≈error	≈error

Из-за нелинейно растущей (O(n^2)) сложности SVM желательно тренировать модель в Google Colab или на видеокарте.