[ResNet18 DepthNet]
[Conv2d]
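* K: kernel size = 3 (default; overridden where a layer states K explicitly)
* S: stride = 1 (default; overridden where a layer states S explicitly)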
[Max Pooling]
* K: kernel size = 2
[ResidualBlock]
#1 Conv2d + BatchNorm + ReLU
#2 Conv2d + BatchNorm
#3 Concat (#1, #2, D=1)
#0 Input RGB image = 3, H, W
#1 [Depth Encoder]
#1 Conv2d (K=7, S=2) + BatchNorm + ReLU = 64, H/2, W/2
#2 Conv2d + BatchNorm + ReLU = 64, H/2, W/2
#3 ResidualBlock (#2) *2 = 64, H/2, W/2
#4 Max Pooling = 64, H/4, W/4
#5 ResidualBlock (#3 + #2) *2 = 128, H/4, W/4