
2024-04-25 03:38
彻底解决keras model.summary()或者layer.count_params()权重参数个数为负数问题

举两个例子:一张(1024,1024,3)的图片,分别使用VGGNet、DenseNet网络。


'''
Created on 2018-09-30

Estimate, layer by layer, how much memory a Keras VGG16 model needs for its
weights and for its feature maps when fed a (1024, 1024, 3) image.
'''
from keras import applications
import numpy as np
import cv2

image = cv2.imread("D:\\xxxx\\hashiqi.jpg")
image = cv2.resize(image, (1024, 1024), interpolation=cv2.INTER_CUBIC)
x_train = np.expand_dims(image, axis=0)
y_train = np.array([0])
# (1024, 1024, 3)
# Headless variant first, then the variant with the fully connected head;
# the second assignment replaces the first, so the loop below inspects the
# model that includes the fully connected layers.
model = applications.VGG16(input_shape=(1024, 1024, 3), include_top=False, weights=None)
model = applications.VGG16(input_shape=(1024, 1024, 3), include_top=True, weights=None)
# Total parameters without the fully connected layers: 14714688
# Total parameters with the fully connected layers: 2183080744
# -> almost all of the weights live in the fully connected layers
all_params_memory = 0
all_feature_memory = 0
for layer in model.layers:
    # Memory taken by the trainable weights w (4 bytes per parameter, in MiB)
    params_memory = layer.count_params() / (1024 * 1024) * 4
    print("训练权重w占用的内存:", layer.name, layer.count_params(), str(params_memory) + " M")
    all_params_memory = all_params_memory + params_memory
    # Memory taken by the feature map: product of every output dimension
    # except the leading batch dimension (index 0)
    feature_shape = layer.output_shape
    feature_size = 1
    for i in range(1, len(feature_shape)):
        feature_size = feature_size * feature_shape[i]
    feature_memory = feature_size / (1024 * 1024) * 4
    print("特征图占用内存:", feature_shape, feature_size, str(feature_memory) + " M")
    all_feature_memory = all_feature_memory + feature_memory
# Sample output follows:
# 特征图占用内存: (None, 1024, 1024, 3) 3145728 12.0 M
# 训练权重w占用的内存: block1_conv1 1792 0.0068359375 M
# 特征图占用内存: (None, 1024, 1024, 64) 67108864 256.0 M
# 训练权重w占用的内存: block1_conv2 36928 0.140869140625 M
# 特征图占用内存: (None, 1024, 1024, 64) 67108864 256.0 M
# 训练权重w占用的内存: block1_pool 0 0.0 M
# 特征图占用内存: (None, 512, 512, 64) 16777216 64.0 M
# 训练权重w占用的内存: block2_conv1 73856 0.28173828125 M
# 特征图占用内存: (None, 512, 512, 128) 33554432 128.0 M
# 训练权重w占用的内存: block2_conv2 147584 0.56298828125 M
# 特征图占用内存: (None, 512, 512, 128) 33554432 128.0 M
# 训练权重w占用的内存: block2_pool 0 0.0 M
# 特征图占用内存: (None, 256, 256, 128) 8388608 32.0 M
# 训练权重w占用的内存: block3_conv1 295168 1.1259765625 M
# 特征图占用内存: (None, 256, 256, 256) 16777216 64.0 M
# 训练权重w占用的内存: block3_conv2 590080 2.2509765625 M
# 特征图占用内存: (None, 256, 256, 256) 16777216 64.0 M
# 训练权重w占用的内存: block3_conv3 590080 2.2509765625 M
# 特征图占用内存: (None, 256, 256, 256) 16777216 64.0 M
# 训练权重w占用的内存: block3_pool 0 0.0 M
# 特征图占用内存: (None, 128, 128, 256) 4194304 16.0 M
# 训练权重w占用的内存: block4_conv1 1180160 4.501953125 M
# 特征图占用内存: (None, 128, 128, 512) 8388608 32.0 M
# 训练权重w占用的内存: block4_conv2 2359808 9.001953125 M
# 特征图占用内存: (None, 128, 128, 512) 8388608 32.0 M
# 训练权重w占用的内存: block4_conv3 2359808 9.001953125 M
# 特征图占用内存: (None, 128, 128, 512) 8388608 32.0 M
# 训练权重w占用的内存: block4_pool 0 0.0 M
# 特征图占用内存: (None, 64, 64, 512) 2097152 8.0 M
# 训练权重w占用的内存: block5_conv1 2359808 9.001953125 M
# 特征图占用内存: (None, 64, 64, 512) 2097152 8.0 M
# 训练权重w占用的内存: block5_conv2 2359808 9.001953125 M
# 特征图占用内存: (None, 64, 64, 512) 2097152 8.0 M
# 训练权重w占用的内存: block5_conv3 2359808 9.001953125 M
# 特征图占用内存: (None, 64, 64, 512) 2097152 8.0 M
# 训练权重w占用的内存: block5_pool 0 0.0 M
# 特征图占用内存: (None, 32, 32, 512) 524288 2.0 M
# 训练权重w占用的内存: flatten 0 0.0 M
# 特征图占用内存: (None, 524288) 524288 2.0 M
# 训练权重w占用的内存: fc1 2147487744 8192.015625 M
# 特征图占用内存: (None, 4096) 4096 0.015625 M
# 训练权重w占用的内存: fc2 16781312 64.015625 M
# 特征图占用内存: (None, 4096) 4096 0.015625 M
# 训练权重w占用的内存: predictions 4097000 15.628814697265625 M
# 特征图占用内存: (None, 1000) 1000 0.003814697265625 M

# Report the totals accumulated over all layers (values are in MiB).
print("网络权重W占用总内存:", str(all_params_memory) + " M")
print("网络特征图占用总内存:", str(all_feature_memory) + " M")
print("网络总消耗内存:", str(all_params_memory + all_feature_memory) + " M")
# 网络权重W占用总内存: 8327.79214477539 M
# 网络特征图占用总内存: 1216.0350647 M
# 网络总消耗内存: 9543.82720947 M


  • 模型权重参数
  • 模型所储存的中间变量


import numpy as np

# model is the neural network we defined in PyTorch
# model.parameters() yields all weight parameters of this model;
# the element count of each one is the product of its dimensions
para = sum([np.prod(list(p.size())) for p in model.parameters()])


Sequential((conv_1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(relu_1): ReLU(inplace)(conv_2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(relu_2): ReLU(inplace)(pool_2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)(conv_3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))


# type_size is 4 because the parameters are float32, i.e. 4 bytes each
type_size = 4
print('Model {} : params: {:4f}M'.format(model._get_name(), para * type_size / 1000 / 1000))


Model Sequential : params: 0.450304M


# model is the model we loaded
# input is the actual input (Tensor) that will be fed to the model
# clone() makes a copy so the caller's input is not affected
input_ = input.clone()
# Gradients are not needed: the goal is only to measure intermediate variables
input_.requires_grad_(requires_grad=False)

mods = list(model.modules())
out_sizes = []

# mods[0] is the model itself, so start from index 1
for i in range(1, len(mods)):
    m = mods[i]
    # Note: an in-place ReLU is skipped, it is not counted as a new output
    if isinstance(m, nn.ReLU):
        if m.inplace:
            continue
    out = m(input_)
    out_sizes.append(np.array(out.size()))
    input_ = out

total_nums = 0
for i in range(len(out_sizes)):
    s = out_sizes[i]
    nums = np.prod(np.array(s))
    total_nums += nums


# Print both cases: forward only, and forward plus backward
print('Model {} : intermedite variables: {:3f} M (without backward)'.format(model._get_name(), total_nums * type_size / 1000 / 1000))
print('Model {} : intermedite variables: {:3f} M (with backward)'.format(model._get_name(), total_nums * type_size*2 / 1000 / 1000))



Model Sequential : intermedite variables: 336.089600 M (without backward)
Model Sequential : intermedite variables: 672.179200 M (with backward)


# Model memory-usage estimation function
# model: the model to inspect
# input: the actual Tensor that would be fed to the model
# type_size: bytes per element, defaults to 4 (float32)
def modelsize(model, input, type_size=4):
    """Print an estimate of the memory used by a model's parameters and outputs.

    Three lines are printed: the size of all parameters, the size of the
    intermediate outputs for a forward pass, and twice that amount for
    forward plus backward. Sizes are ``count * type_size / 1000 / 1000``.

    Args:
        model: module whose ``parameters()`` and ``modules()`` are inspected.
        input: tensor fed through the sub-modules; it is cloned first.
        type_size: number of bytes per element (4 for float32).

    Returns:
        None. The results are only printed.

    NOTE(review): each entry of ``model.modules()`` after the first is applied
    to the previous entry's output, so this presumably only makes sense for a
    flat, purely sequential model -- confirm before using it on nested models.
    """
    # Element count of every parameter tensor is the product of its dimensions.
    para = sum([np.prod(list(p.size())) for p in model.parameters()])
    print('Model {} : params: {:4f}M'.format(model._get_name(), para * type_size / 1000 / 1000))

    # Work on a copy so the caller's tensor is left untouched.
    input_ = input.clone()
    input_.requires_grad_(requires_grad=False)

    mods = list(model.modules())
    out_sizes = []

    # mods[0] is the model itself, so only the remaining entries are run.
    for m in mods[1:]:
        # An in-place ReLU is skipped: it is not counted as a new output.
        if isinstance(m, nn.ReLU):
            if m.inplace:
                continue
        out = m(input_)
        out_sizes.append(np.array(out.size()))
        input_ = out

    total_nums = 0
    for s in out_sizes:
        nums = np.prod(np.array(s))
        total_nums += nums

    print('Model {} : intermedite variables: {:3f} M (without backward)'.format(model._get_name(), total_nums * type_size / 1000 / 1000))
    print('Model {} : intermedite variables: {:3f} M (with backward)'.format(model._get_name(), total_nums * type_size*2 / 1000 / 1000))








# Input
input = torch.rand(1, 10)
# Suppose we have a very deep network: 1000 stacked Linear(10, 10) layers
layers = [nn.Linear(10, 10) for _ in range(1000)]
model = nn.Sequential(*layers)
output = model(input)


# Bring in the checkpoint helper
from torch.utils.checkpoint import checkpoint

# First make the input require gradients (requires_grad=True);
# if this is not set, the resulting gradient may be 0
input = torch.rand(1, 10, requires_grad=True)
layers = [nn.Linear(10, 10) for _ in range(1000)]


# Define the functions that run the layers; there are two of them:
# one runs the first 500 layers, the other runs the following layers
# (everything from index 500 up to, but not including, the last layer)
def run_first_half(*args):
    x = args[0]
    for layer in layers[:500]:
        x = layer(x)
    return x


def run_second_half(*args):
    x = args[0]
    for layer in layers[500:-1]:
        x = layer(x)
    return x


# use_reentrant is passed explicitly, as the PyTorch checkpoint
# documentation recommends
x = checkpoint(run_first_half, input, use_reentrant=False)
x = checkpoint(run_second_half, x, use_reentrant=False)
# The last layer is run on its own, outside of any checkpoint
x = layers[-1](x)
x.sum().backward()  # that is all that is needed


from torch.utils.checkpoint import checkpoint_sequential

input = torch.rand(1, 10, requires_grad=True)
layers = [nn.Linear(10, 10) for _ in range(1000)]
model = nn.Sequential(*layers)

# Split the model into two segments
num_segments = 2
# use_reentrant is passed explicitly, as the PyTorch checkpoint
# documentation recommends
x = checkpoint_sequential(model, num_segments, input, use_reentrant=False)
x.sum().backward()  # that is all that is needed




# 08-Jun-18-17:56:51-gpu_mem_prof
At __main__ <module>: line 39                        Total Used Memory:399.4  Mb
At __main__ <module>: line 40                        Total Used Memory:992.5  Mb
+ __main__ <module>: line 40                         (1, 1, 682, 700)     1.82 M <class 'torch.Tensor'>
+ __main__ <module>: line 40                         (1, 3, 682, 700)     5.46 M <class 'torch.Tensor'>
At __main__ <module>: line 126                       Total Used Memory:1088.5 Mb
+ __main__ <module>: line 126                        (64, 64, 3, 3)       0.14 M <class 'torch.nn.parameter.Parameter'>
+ __main__ <module>: line 126                        (128, 64, 3, 3)      0.28 M <class 'torch.nn.parameter.Parameter'>
+ __main__ <module>: line 126                        (128, 128, 3, 3)     0.56 M <class 'torch.nn.parameter.Parameter'>
+ __main__ <module>: line 126                        (64, 3, 3, 3)        0.00 M <class 'torch.nn.parameter.Parameter'>
+ __main__ <module>: line 126                        (256, 256, 3, 3)     2.25 M <class 'torch.nn.parameter.Parameter'>
+ __main__ <module>: line 126                        (512, 256, 3, 3)     4.5 M <class 'torch.nn.parameter.Parameter'>
+ __main__ <module>: line 126                        (512, 512, 3, 3)     9.0 M <class 'torch.nn.parameter.Parameter'>
+ __main__ <module>: line 126                        (64,)                0.00 M <class 'torch.nn.parameter.Parameter'>
+ __main__ <module>: line 126                        (1, 3, 682, 700)     5.46 M <class 'torch.Tensor'>
+ __main__ <module>: line 126                        (128,)               0.00 M <class 'torch.nn.parameter.Parameter'>
+ __main__ <module>: line 126                        (256,)               0.00 M <class 'torch.nn.parameter.Parameter'>
+ __main__ <module>: line 126                        (512,)               0.00 M <class 'torch.nn.parameter.Parameter'>
+ __main__ <module>: line 126                        (3,)                 1.14 M <class 'torch.Tensor'>
+ __main__ <module>: line 126                        (256, 128, 3, 3)     1.12 M <class 'torch.nn.parameter.Parameter'>


import datetime
import linecache
import os

import gc
import pynvml
import torch
import numpy as np

# When True, per-tensor allocation / release lines are written to the log too.
print_tensor_sizes = True
# Tensor descriptors seen at the previous traced line, used to compute diffs.
last_tensor_sizes = set()
# Log file name, stamped with the start time (e.g. 08-Jun-18-17:56:51-...).
gpu_profile_fn = f'{datetime.datetime.now():%d-%b-%y-%H:%M:%S}-gpu_mem_prof.txt'

# if 'GPU_DEBUG' in os.environ:
#     print('profiling gpu usage to ', gpu_profile_fn)

# Details of the most recently traced line; filled in by gpu_profile().
lineno = None
func_name = None
filename = None
module_name = None

# fram = inspect.currentframe()
# func_name = fram.f_code.co_name
# filename = fram.f_globals["__file__"]
# ss = os.path.dirname(os.path.abspath(filename))
# module_name = fram.f_globals["__name__"]


def gpu_profile(frame, event, arg=None):
    """Log GPU memory usage for the previously traced source line.

    The function has the shape of a trace function: it takes a frame and an
    event name and returns itself. ``arg`` is accepted (and ignored) so that
    it can be called with the three arguments ``sys.settrace`` passes.
    NOTE(review): the registration call itself is not shown in this file.

    On every ``'line'`` event it appends to ``gpu_profile_fn`` the used memory
    of GPU 0 as reported by NVML, attributed to the line recorded on the
    previous call, and, when ``print_tensor_sizes`` is True, the CUDA tensors
    that appeared (``+``) or disappeared (``-``) since the last call.

    Args:
        frame: the frame about to execute a line.
        event: trace event name; only ``'line'`` is handled.
        arg: unused.

    Returns:
        This function, so tracing continues.
    """
    # The event refers to the line that is _about to_ execute (!)
    global last_tensor_sizes
    global lineno, func_name, filename, module_name

    if event == 'line':
        try:
            # Everything reported here is about the _previous_ line (!)
            if lineno is not None:
                pynvml.nvmlInit()
                # handle = pynvml.nvmlDeviceGetHandleByIndex(int(os.environ['GPU_DEBUG']))
                handle = pynvml.nvmlDeviceGetHandleByIndex(0)
                meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                # Source text of the previous line (currently not written out).
                line = linecache.getline(filename, lineno)
                where_str = module_name+' '+func_name+':'+' line '+str(lineno)

                with open(gpu_profile_fn, 'a+') as f:
                    f.write(f"At {where_str:<50}"
                            f"Total Used Memory:{meminfo.used/1024**2:<7.1f}Mb\n")

                    if print_tensor_sizes is True:
                        # Tag tensors first seen now with the current location.
                        for tensor in get_tensors():
                            if not hasattr(tensor, 'dbg_alloc_where'):
                                tensor.dbg_alloc_where = where_str
                        # Size in MiB; the factor 4 assumes 4-byte (float32)
                        # elements -- TODO confirm for other dtypes.
                        new_tensor_sizes = {(type(x), tuple(x.size()),
                                             np.prod(np.array(x.size()))*4/1024**2,
                                             x.dbg_alloc_where) for x in get_tensors()}
                        # Fixed-point formatting: truncating str(m) turned tiny
                        # sizes such as 1.14e-05 into a misleading "1.14".
                        for t, s, m, loc in new_tensor_sizes - last_tensor_sizes:
                            f.write(f'+ {loc:<50} {str(s):<20} {m:.2f} M {str(t):<10}\n')
                        for t, s, m, loc in last_tensor_sizes - new_tensor_sizes:
                            f.write(f'- {loc:<50} {str(s):<20} {m:.2f} M {str(t):<10}\n')
                        last_tensor_sizes = new_tensor_sizes
                pynvml.nvmlShutdown()

            # Save details about the line that is _to be_ executed
            lineno = None

            func_name = frame.f_code.co_name
            filename = frame.f_globals["__file__"]
            # Map compiled file names back to the .py source
            if (filename.endswith(".pyc") or
                    filename.endswith(".pyo")):
                filename = filename[:-1]
            module_name = frame.f_globals["__name__"]
            lineno = frame.f_lineno

            return gpu_profile

        except Exception as e:
            # Best effort: report the problem and keep tracing.
            print('A exception occured: {}'.format(e))

    return gpu_profile


def get_tensors():
    """Yield every CUDA tensor currently tracked by the garbage collector."""
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj):
                tensor = obj
            else:
                continue
            if tensor.is_cuda:
                yield tensor
        except Exception as e:
            # Inspecting arbitrary objects can fail; report and move on.
            print('A exception occured: {}'.format(e))

需要注意的是,linecache中的getline只能读取缓冲过的文件,如果这个文件没有运行过则返回无效值。

Python 的垃圾收集机制会在变量没有被引用的时候立马进行回收,但是为什么模型中计算的中间变量在执行结束后还会存在呢?既然都没有引用了,为什么还会占用空间?






