PointNet

BibTeX #

@inproceedings{qi2017pointnet,
  title={Pointnet: Deep learning on point sets for 3d classification and segmentation},
  author={Qi, Charles R and Su, Hao and Mo, Kaichun and Guibas, Leonidas J},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={652--660},
  year={2017}
}

Architecture #

Input transform
Learn a 3×3 affine transformation matrix that is applied to all points to align them
mlp(64, 64)
shared MLP (multi-layer perceptron) with layer sizes 64 and 64, applied to each point independently
Feature transform
Learn a data-dependent 64×64 feature alignment matrix and multiply it with the per-point feature vectors
mlp(64, 128, 1024)
shared MLP with three layers (64, 128, 1024) to process the aligned features
max pooling
compute an element-wise maximum over the feature vectors of all points to get a “global feature” vector; this symmetric function makes the result invariant to the ordering of the points
For Classification
use the global feature vector with mlp(512,256,k) to predict k different classes
For Segmentation
Use the aligned per-point feature vectors and concatenate the global feature vector to each of them.
  • Apply further shared MLPs, with the final one producing m segmentation class scores for each point (see the shape walk-through below)
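
To make the data flow concrete, here is a minimal shape walk-through in PyTorch. The tensors are stand-ins for intermediate activations; the sizes follow the architecture above.

import torch

B, N = 8, 1024                                # batch size, points per cloud
points = torch.rand(B, 3, N)                  # input point clouds
# after input transform + mlp(64, 64):          per-point features (B, 64, N)
point_feats = torch.rand(B, 64, N)            # stand-in for the aligned features
# after feature transform + mlp(64, 128, 1024): (B, 1024, N)
deep_feats = torch.rand(B, 1024, N)           # stand-in
global_feat = deep_feats.max(dim=2).values    # (B, 1024), invariant to point order
# segmentation branch: concatenate the global feature to every point feature
seg_in = torch.cat([global_feat.unsqueeze(2).expand(-1, -1, N), point_feats], dim=1)
print(global_feat.shape, seg_in.shape)        # (8, 1024) and (8, 1088, 1024)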

Implementation #

  • We define all layers up to and including the max pooling operation as the PointNetEncoder
    • PointNetEncoder produces the global feature vector

Model #

import numpy as np
import torch
from torch import nn
import torch.nn.functional as F


class TNet(nn.Module):
    def __init__(self, k):
        super().__init__()
        # Shared MLP implemented as 1x1 convolutions: k->64, 64->128,
        #   128->1024, each followed by batch norm and ReLU

        self.conv1 = nn.Conv1d(in_channels=k, out_channels=64, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=1)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=1024, kernel_size=1)


        # Regression head: Linear 1024->512, 512->256, 256->k^2, with batch
        #   norms and ReLU after all but the last layer
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(1024)
        self.bn4 = nn.BatchNorm1d(512)
        self.bn5 = nn.BatchNorm1d(256)

        self.relu = nn.ReLU()

        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, k**2)

        self.register_buffer('identity', torch.from_numpy(np.eye(k).flatten().astype(np.float32)).view(1, k ** 2))
        self.k = k

    def forward(self, x):
        b = x.shape[0]

        # NOTE Pass input through layers, applying the same max operation as in
        #   PointNetEncoder
        layers = [
            self.conv1,
            self.bn1,
            self.relu,
            self.conv2,
            self.bn2,
            self.relu,
            self.conv3,
            self.bn3,
            self.relu,

            # NOTE maxpooling
            lambda x: torch.max(x, 2, keepdim=True)[0],
            lambda x: x.view(-1, 1024),

            self.fc1,
            self.bn4,
            self.relu,

            self.fc2,
            self.bn5,
            self.relu,

            self.fc3,
        ]

        for layer in layers:
            x = layer(x)


        # NOTE No batch norm and relu after the last Linear layer

        # Add the identity so the predicted transform starts at the identity
        # matrix; orthogonality of the feature transform is further encouraged
        # by a regularization loss (see the sketch below)
        identity = self.identity.repeat(b, 1)
        x = x + identity
        x = x.view(-1, self.k, self.k)
        return x
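

# The paper additionally regularizes the predicted feature transform A toward
# an orthogonal matrix via a loss L_reg = ||I - A A^T||_F^2. A minimal sketch
# of such a regularizer (an assumption -- not part of the code in this section):
def feature_transform_regularizer(trans):
    """trans: batch of transforms predicted by TNet, shape (B, k, k)."""
    k = trans.shape[1]
    identity = torch.eye(k, device=trans.device).unsqueeze(0)
    diff = identity - torch.bmm(trans, trans.transpose(2, 1))
    return diff.norm(dim=(1, 2)).pow(2).mean()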


class PointNetEncoder(nn.Module):
    def __init__(self, return_point_features=False):
        super().__init__()

        # Shared MLP as 1x1 convolutions: 3->64, 64->128, 128->1024, with
        #   batch norms and ReLU
        self.conv1 = nn.Conv1d(in_channels=3, out_channels=64, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=1)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=1024, kernel_size=1)

        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(1024)

        self.relu = nn.ReLU()

        self.input_transform_net = TNet(k=3)
        self.feature_transform_net = TNet(k=64)

        self.return_point_features = return_point_features

    def forward(self, x):
        num_points = x.shape[2]

        input_transform = self.input_transform_net(x)
        x = torch.bmm(x.transpose(2, 1), input_transform).transpose(2, 1)

        # NOTE: First layer: 3->64
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        feature_transform = self.feature_transform_net(x)
        x = torch.bmm(x.transpose(2, 1), feature_transform).transpose(2, 1)
        point_features = x

        # NOTE: Layers 2 and 3: 64->128, 128->1024
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)

        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu(x)

        # This is the symmetric max operation
        x = torch.max(x, 2, keepdim=True)[0]
        x = x.view(-1, 1024)

        if self.return_point_features:
            x = x.view(-1, 1024, 1).repeat(1, 1, num_points)
            return torch.cat([x, point_features], dim=1)  # (B, 1024 + 64 = 1088, N)
        else:
            return x


class PointNetClassification(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.encoder = PointNetEncoder(return_point_features=False)
        # Linear layers 1024->512->256->num_classes with batch norms, dropout
        # (p=0.3), and ReLU.
        #
        #   Batch norms and ReLUs are used after all but the last layer.
        #   Dropout is used only directly after the second Linear layer. The
        #   last Linear layer reduces the number of feature channels to
        #   num_classes (=k in the architecture visualization).

        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(0.3)
        self.bn1 = nn.BatchNorm1d(512)
        self.bn2 = nn.BatchNorm1d(256)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.encoder(x)
        # NOTE Pass output of encoder through your layers

        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.fc2(x)
        x = self.dropout(x)
        x = self.bn2(x)
        x = self.relu(x)

        x = self.fc3(x)

        return x
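
# Training sketch (an illustration, not from the exercise): the raw logits
# returned above pair directly with nn.CrossEntropyLoss, which applies
# log-softmax internally:
#
#   criterion = nn.CrossEntropyLoss()
#   loss = criterion(model(points), labels)  # labels: (B,) class indices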


class PointNetSegmentation(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes
        self.encoder = PointNetEncoder(return_point_features=True)
        # Shared MLP head as 1x1 convolutions: 1088->512, 512->256, 256->128,
        #   128->num_classes, with batch norms and ReLU after all but the last

        self.conv1 = nn.Conv1d(in_channels=1088, out_channels=512, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=512, out_channels=256, kernel_size=1)
        self.conv3 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=1)
        self.conv4 = nn.Conv1d(in_channels=128, out_channels=num_classes, kernel_size=1)

        self.bn1 = nn.BatchNorm1d(512)
        self.bn2 = nn.BatchNorm1d(256)
        self.bn3 = nn.BatchNorm1d(128)

        self.relu = nn.ReLU()


    def forward(self, x):
        x = self.encoder(x)
        # NOTE: Pass x through all layers, no batch norm or ReLU after the last conv layer

        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)

        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu(x)

        x = self.conv4(x)

        x = x.transpose(2, 1).contiguous()  # (B, N, num_classes) per-point scores
        return x
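
As a quick sanity check, these are the expected output shapes of the three modules (batch size and class counts chosen arbitrarily):

import torch

points = torch.rand(2, 3, 1024)                 # (B, 3, N)
encoder = PointNetEncoder(return_point_features=True).eval()
classifier = PointNetClassification(num_classes=13).eval()
segmenter = PointNetSegmentation(num_classes=50).eval()
with torch.no_grad():
    print(encoder(points).shape)     # torch.Size([2, 1088, 1024])
    print(classifier(points).shape)  # torch.Size([2, 13])
    print(segmenter(points).shape)   # torch.Size([2, 1024, 50])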

Inference #

Classification #

"""Utility for inference using trained networks"""

import torch

from exercise_2.data.shapenet import ShapeNetPoints
from exercise_2.model.pointnet import PointNetClassification


class InferenceHandlerPointNetClassification:
    """Utility for inference using trained PointNet network"""

    def __init__(self, ckpt):
        """
        :param ckpt: checkpoint path to weights of the trained network
        """
        self.model = PointNetClassification(ShapeNetPoints.num_classes)
        self.model.load_state_dict(torch.load(ckpt, map_location='cpu'))
        self.model.eval()

    def infer_single(self, points):
        """
        Infer class of the shape given its point cloud representation
        :param points: points of shape 3 x 1024
        :return: class category name for the point cloud, as predicted by the model
        """
        input_tensor = torch.from_numpy(points).float().unsqueeze(0)

        # Predict class scores and map the argmax to a category name
        with torch.no_grad():
            prediction = self.model(input_tensor)
        class_id = ShapeNetPoints.classes[torch.argmax(prediction, dim=1).item()]
        class_name = ShapeNetPoints.class_name_mapping[class_id]

        return class_name
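
Hypothetical usage of the handler (the checkpoint path and the random input points are placeholders):

import numpy as np

handler = InferenceHandlerPointNetClassification('runs/pointnet_cls.ckpt')
points = np.random.rand(3, 1024).astype(np.float32)
print(handler.infer_single(points))  # a category name, e.g. 'airplane'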

Segmentation #

import torch

from exercise_2.data.shapenet_parts import ShapeNetParts
from exercise_2.model.pointnet import PointNetSegmentation


class InferenceHandlerPointNetSegmentation:
    """Utility for segmentation inference using trained PointNet network"""

    def __init__(self, ckpt):
        """
        :param ckpt: checkpoint path to weights of the trained network
        """
        self.model = PointNetSegmentation(ShapeNetParts.num_classes)
        self.model.load_state_dict(torch.load(ckpt, map_location='cpu'))
        self.model.eval()

    def infer_single(self, points):
        """
        Infer class of the shape given its point cloud representation
        :param points: points of shape 3 x 1024
        :return: part segmentation labels for the point cloud, as predicted by the model
        """
        input_tensor = torch.from_numpy(points).float().unsqueeze(0)
        prediction = torch.argmax(self.model(input_tensor)[0], dim=1)
        return prediction
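
Hypothetical usage (again, the checkpoint path and random points are placeholders):

import numpy as np

handler = InferenceHandlerPointNetSegmentation('runs/pointnet_seg.ckpt')
points = np.random.rand(3, 1024).astype(np.float32)
labels = handler.infer_single(points)
print(labels.shape)  # torch.Size([1024]) -- one part label per point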
October 22, 2023