# 人工智能系统实战第三期/实战代码/深度学习项目实战/基于transformer的花朵识别/ViTFlowerClassification/model.py
from functools import partial
from collections import OrderedDict

import torch
import torch.nn as nn


def drop_path(x, drop_prob: float = 0., training: bool = False):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 discusses some ambiguity
    around the name 'Drop Connect'. This implementation adopts the approach suggested by a
    commenter there; the original comment reads:

    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
    'survival rate' as the argument.
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    # work with diff dim tensors, not just 2D ConvNets
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize
    output = x.div(keep_prob) * random_tensor
    return output
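

# Example (a minimal sketch of the semantics): for x = torch.ones(4, 197, 768),
# drop_path(x, drop_prob=0.5, training=True) either zeroes each of the 4 samples
# entirely or keeps it rescaled to 2.0 (= 1 / keep_prob), so the expected value
# of the output equals x.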


class DropPath(nn.Module):
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        # Randomly drop the entire residual branch (a whole block) per sample.
        return drop_path(x, self.drop_prob, self.training)


class PatchEmbed(nn.Module):
    """
    2D Image Patch Embedding
    """

    def __init__(self, image_size=224, patch_size=16, in_c=3, embed_dim=768, norm_layer=None):
        """
        Map the input tensor to patches.
        Args:
            image_size: input image size
            patch_size: patch size
            in_c: number of input channels
            embed_dim: embedding dimension. dimension = patch_size * patch_size * in_c
            norm_layer: layer-normalization method
        """
        super().__init__()
        image_size = (image_size, image_size)
        patch_size = (patch_size, patch_size)
        self.image_size = image_size
        self.patch_size = patch_size
        self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        # Split the input tensor into patches with a patch_size x patch_size (16x16 by
        # default) convolution whose stride equals the patch size.
        self.proj = nn.Conv2d(in_c, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        B, C, H, W = x.shape
        assert H == self.image_size[0] and W == self.image_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."

        # flatten: [B, C, H, W] -> [B, C, HW]
        # transpose: [B, C, HW] -> [B, HW, C]
        x = self.proj(x).flatten(2).transpose(1, 2)
        x = self.norm(x)
        return x
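

# Example (using the defaults above): a 224x224 RGB image split into 16x16 patches
# yields (224 // 16) ** 2 = 196 patches, each projected to embed_dim = 768, i.e.
# PatchEmbed()(torch.rand(1, 3, 224, 224)).shape == torch.Size([1, 196, 768]).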


class Attention(nn.Module):
    def __init__(self,
                 dim,  # dimension of the input tokens
                 num_heads=8,  # number of heads for multi-head self-attention
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop_ratio=0.,
                 proj_drop_ratio=0.):
        super(Attention, self).__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5  # scaling factor 1/sqrt(d)
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop_ratio)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop_ratio)

    def forward(self, x):
        # Standard scaled dot-product multi-head self-attention over the token sequence.
        # [batch_size, num_patches + 1, total_embed_dim]
        B, N, C = x.shape

        # qkv(): -> [B, N, 3 * C]
        # reshape: -> [B, N, 3, num_heads, C // num_heads]
        # permute: -> [3, B, num_heads, N, C // num_heads]
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        # attention weights: softmax(q @ k^T / sqrt(d))
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        # weighted sum of values, then merge heads: [B, num_heads, N, head_dim] -> [B, N, C]
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
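

# Note: self-attention is quadratic in sequence length. With the ViT-B/16 settings
# used below (12 heads, 196 patches + 1 class token), each block materializes a
# [B, 12, 197, 197] attention map.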


class MLP(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class Block(nn.Module):
    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop_ratio=0.,
                 attn_drop_ratio=0.,
                 drop_path_ratio=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm):
        super(Block, self).__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
                              attn_drop_ratio=attn_drop_ratio, proj_drop_ratio=drop_ratio)
        # NOTE: drop path for stochastic depth; identity when the ratio is 0.
        self.drop_path = DropPath(drop_path_ratio) if drop_path_ratio > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = MLP(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop_ratio)

    def forward(self, x):
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x
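

# Design note: this is the pre-norm Transformer encoder layer from the ViT paper,
# i.e. x + DropPath(MHSA(LN(x))) followed by x + DropPath(MLP(LN(x))).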


class VisionTransformer(nn.Module):
    def __init__(self,
                 image_size=224,
                 patch_size=16,
                 in_c=3,
                 num_classes=1000,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
                 mlp_ratio=4.0,
                 qkv_bias=True,
                 qk_scale=None,
                 representation_size=None,
                 distilled=False,
                 drop_ratio=0.,
                 attn_drop_ratio=0.,
                 drop_path_ratio=0.5,
                 embed_layer=PatchEmbed,
                 norm_layer=None,
                 act_layer=None):
        """
        Args:
            image_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_c (int): number of input channels
            num_classes (int): number of classes for the classification head
            embed_dim (int): embedding dimension, dim = patch_size * patch_size * in_c
            depth (int): depth of the transformer (number of blocks)
            num_heads (int): number of heads for multi-head self-attention
            mlp_ratio (float): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for the qkv projection if True
            qk_scale (float): override the default qk scale if set
            representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
            distilled (bool): whether to use distillation to optimize the model (as in DeiT, a distilled variant of ViT)
            drop_ratio (float): dropout rate
            attn_drop_ratio (float): attention dropout rate
            drop_path_ratio (float): stochastic depth rate
            embed_layer (nn.Module): patch embedding layer
            norm_layer: (nn.Module): normalization layer
        """
        super(VisionTransformer, self).__init__()
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.num_tokens = 2 if distilled else 1
        # partial works like a closure: the first argument is a callable, the remaining
        # arguments are pre-bound to that callable.
        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        act_layer = act_layer or nn.GELU

        self.patch_embed = embed_layer(image_size=image_size, patch_size=patch_size, in_c=in_c, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        # nn.Parameter registers each tensor as a trainable parameter of the module.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.dist_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if distilled else None
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_ratio)

        dpr = [x.item() for x in torch.linspace(0, drop_path_ratio, depth)]  # stochastic depth decay rule
        self.blocks = nn.Sequential(*[
            Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                  drop_ratio=drop_ratio, attn_drop_ratio=attn_drop_ratio, drop_path_ratio=dpr[i],
                  norm_layer=norm_layer, act_layer=act_layer)
            for i in range(depth)
        ])
        self.norm = norm_layer(embed_dim)

        # Representation layer
        if representation_size and not distilled:
            self.has_logits = True
            self.num_features = representation_size
            self.pre_logits = nn.Sequential(OrderedDict([
                ("fc", nn.Linear(embed_dim, representation_size)),
                ("act", nn.Tanh())
            ]))
        else:
            self.has_logits = False
            self.pre_logits = nn.Identity()  # placeholder

        # Classifier head(s)
        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
        self.head_dist = None
        if distilled:
            self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity()

        # Weight init
        nn.init.trunc_normal_(self.pos_embed, std=0.02)
        if self.dist_token is not None:
            nn.init.trunc_normal_(self.dist_token, std=0.02)
        nn.init.trunc_normal_(self.cls_token, std=0.02)
        self.apply(_init_vit_weights)

    def forward_features(self, x):
        # [B, C, H, W] -> [B, num_patches, embed_dim]
        x = self.patch_embed(x)  # [B, 196, 768]
        # [1, 1, 768] -> [B, 1, 768]
        cls_token = self.cls_token.expand(x.shape[0], -1, -1)
        if self.dist_token is None:
            x = torch.cat((cls_token, x), dim=1)  # [B, 197, 768]
        else:
            x = torch.cat((cls_token, self.dist_token.expand(x.shape[0], -1, -1), x), dim=1)

        x = self.pos_drop(x + self.pos_embed)
        x = self.blocks(x)
        x = self.norm(x)
        if self.dist_token is None:
            return self.pre_logits(x[:, 0])
        else:
            return x[:, 0], x[:, 1]

    def forward(self, x):
        x = self.forward_features(x)
        if self.head_dist is not None:
            x, x_dist = self.head(x[0]), self.head_dist(x[1])
            if self.training and not torch.jit.is_scripting():
                return x, x_dist
            else:
                # At inference time, average the class-token and distillation-token predictions.
                return (x + x_dist) / 2
        else:
            x = self.head(x)
        return x
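

# Shape summary (with the defaults above, dist_token is None and pre_logits is Identity):
# forward_features maps [B, 3, 224, 224] -> [B, 768] (the class token), and forward
# then maps that to [B, num_classes] logits via the classification head.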


def _init_vit_weights(m):
    """
    ViT weight initialization
    :param m: module
    """
    if isinstance(m, nn.Linear):
        nn.init.trunc_normal_(m.weight, std=.01)
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode="fan_out")
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LayerNorm):
        nn.init.zeros_(m.bias)
        nn.init.ones_(m.weight)


def vit_base_patch16_224_in21k(num_classes: int = 21843, has_logits: bool = True):
    """
    ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
    weights ported from official Google JAX impl:
    https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch16_224_in21k-e5005f0a.pth
    """
    model = VisionTransformer(image_size=224,
                              patch_size=16,
                              embed_dim=768,
                              depth=12,
                              num_heads=12,
                              representation_size=768 if has_logits else None,
                              num_classes=num_classes)
    return model


def vit_base_patch32_224_in21k(num_classes: int = 21843, has_logits: bool = True):
    """
    ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
    weights ported from official Google JAX impl:
    https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch32_224_in21k-8db57226.pth
    """
    model = VisionTransformer(image_size=224,
                              patch_size=32,
                              embed_dim=768,
                              depth=12,
                              num_heads=12,
                              representation_size=768 if has_logits else None,
                              num_classes=num_classes)
    return model


def vit_large_patch16_224_in21k(num_classes: int = 21843, has_logits: bool = True):
    """
    ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
    weights ported from official Google JAX impl:
    https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch16_224_in21k-606da67d.pth
    """
    model = VisionTransformer(image_size=224,
                              patch_size=16,
                              embed_dim=1024,
                              depth=24,
                              num_heads=16,
                              representation_size=1024 if has_logits else None,
                              num_classes=num_classes)
    return model


def vit_large_patch32_224_in21k(num_classes: int = 21843, has_logits: bool = True):
    """
    ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
    weights ported from official Google JAX impl:
    https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth
    """
    model = VisionTransformer(image_size=224,
                              patch_size=32,
                              embed_dim=1024,
                              depth=24,
                              num_heads=16,
                              representation_size=1024 if has_logits else None,
                              num_classes=num_classes)
    return model


def vit_huge_patch14_224_in21k(num_classes: int = 21843, has_logits: bool = True):
    """
    ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
    NOTE: converted weights not currently available, too large for github release hosting.
    """
    model = VisionTransformer(image_size=224,
                              patch_size=14,
                              embed_dim=1280,
                              depth=32,
                              num_heads=16,
                              representation_size=1280 if has_logits else None,
                              num_classes=num_classes)
    return model
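

# A minimal smoke test (an illustrative sketch, not part of the original training code).
# It builds a deliberately small VisionTransformer and checks the output shape; the
# embed_dim/depth/num_heads values are arbitrary small choices, and num_classes=5 is
# just an example (e.g. a 5-class flower dataset).
if __name__ == "__main__":
    tiny_vit = VisionTransformer(image_size=224,
                                 patch_size=16,
                                 embed_dim=96,
                                 depth=2,
                                 num_heads=3,
                                 num_classes=5)
    tiny_vit.eval()  # disables dropout and stochastic depth
    with torch.no_grad():
        logits = tiny_vit(torch.rand(1, 3, 224, 224))
    print(logits.shape)  # expected: torch.Size([1, 5])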