GroundingDINO Model Error on GPU: Failure to Annotate with GPU Mode #389
Comments
I solved this by downgrading my onnxruntime-gpu from 1.17.1 to 1.16.0. You need to install the onnxruntime-gpu build that matches your CUDA and cuDNN versions; for details see https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html
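As a quick sanity check before re-running (a minimal sketch, not from this thread), you can confirm that the installed build actually exposes the CUDA execution provider:

```python
import onnxruntime as ort

# The version printed here must match a CUDA/cuDNN combination from the
# compatibility table linked above.
print(ort.__version__)

# If the GPU package is installed correctly, "CUDAExecutionProvider"
# should appear alongside "CPUExecutionProvider".
print(ort.get_available_providers())
```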
@Hermione-ff Thanks! I changed the onnxruntime version to 1.16.0 and annotation now works, but the following problem still remains:
I ran into a similar problem. My current guess is that it is a sequence-length issue, and the fix is to pad the text inputs to max_length. Two parts need to change: first the ONNX export, and second the model itself. Use the following script to generate the ONNX model:

````python
import argparse
import os
import os.path as osp

import onnxruntime as ort
import torch
from groundingdino.models import build_model
from groundingdino.models.GroundingDINO.bertwarper import (
    generate_masks_with_special_tokens_and_transfer_map,
)
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict

"""
The ONNX Export of the Grounding DINO
Written by Wei Wang (CVHub)
Usage:
    1. git clone https://github.com/IDEA-Research/GroundingDINO.git
    2. cd GroundingDINO and pip install -r requirements.txt
    3. export PYTHONPATH=/path/to/your/GroundingDINO
    4. Place the current script in this directory.
    5. Download the corresponding tokenizer.json and place it in this dir.
    6. Run the script.
    ```bash
    python export_grounding_dino_onnx.py \
        --config_file groundingdino/config/GroundingDINO_SwinB_cfg.py \
        --ckpt_file /path/to/your/groundingdino_swinb_cogcoor.pth or groundingdino_swint_ogc \
        --device 'cpu' or 'gpu'
    ```
"""


class Args:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)


class OnnxBaseModel:
    def __init__(self, model_path, device_type: str = "gpu"):
        self.sess_opts = ort.SessionOptions()
        if "OMP_NUM_THREADS" in os.environ:
            self.sess_opts.inter_op_num_threads = int(os.environ["OMP_NUM_THREADS"])
        self.providers = ["CPUExecutionProvider"]
        if device_type.lower() == "gpu":
            self.providers = ["CUDAExecutionProvider"]
        self.ort_session = ort.InferenceSession(
            model_path,
            providers=self.providers,
            sess_options=self.sess_opts,
        )

    def get_ort_inference(self, blob, inputs=None, extract=True, squeeze=False):
        if inputs is None:
            inputs = self.get_input_name()
            outs = self.ort_session.run(None, {inputs: blob})
        else:
            outs = self.ort_session.run(None, inputs)
        if extract:
            outs = outs[0]
        if squeeze:
            outs = outs.squeeze(axis=0)
        return outs

    def get_input_name(self):
        return self.ort_session.get_inputs()[0].name

    def get_input_shape(self):
        return self.ort_session.get_inputs()[0].shape

    def get_output_name(self):
        return [out.name for out in self.ort_session.get_outputs()]


def load_pt_model(model_config_path, model_checkpoint_path, cpu_only=False):
    args = SLConfig.fromfile(model_config_path)
    args.device = "cuda" if not cpu_only else "cpu"
    # modified config
    args.use_checkpoint = False
    args.use_transformer_ckpt = False
    model = build_model(args)
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    _ = model.eval()
    return model


def export_onnx(model, output_file):
    caption = "the running dog ."  # ". ".join(input_text)
    tokenized = model.tokenizer([caption], padding="max_length", return_tensors="pt")
    (
        text_self_attention_masks,
        position_ids,
        _,
    ) = generate_masks_with_special_tokens_and_transfer_map(
        tokenized, model.specical_tokens, model.tokenizer
    )
    input_ids = tokenized["input_ids"]
    token_type_ids = tokenized["token_type_ids"]
    attention_mask = tokenized["attention_mask"]
    text_token_mask = text_self_attention_masks
    img = torch.randn(1, 3, 800, 1200)
    dynamic_axes = {
        "input_ids": {0: "batch_size", 1: "seq_len"},
        "attention_mask": {0: "batch_size", 1: "seq_len"},
        "position_ids": {0: "batch_size", 1: "seq_len"},
        "token_type_ids": {0: "batch_size", 1: "seq_len"},
        "text_token_mask": {0: "batch_size", 1: "seq_len", 2: "seq_len"},
        "img": {0: "batch_size", 2: "height", 3: "width"},
        "logits": {0: "batch_size"},
        "boxes": {0: "batch_size"},
    }
    args = (
        img,
        input_ids,
        attention_mask,
        position_ids,
        token_type_ids,
        text_token_mask,
    )
    input_names = [
        "img",
        "input_ids",
        "attention_mask",
        "position_ids",
        "token_type_ids",
        "text_token_mask",
    ]
    output_names = ["logits", "boxes"]
    # export onnx model
    torch.onnx.export(
        model,
        f=output_file,
        args=args,
        input_names=input_names,
        output_names=output_names,
        dynamic_axes=dynamic_axes,
        opset_version=16,
    )
    print("Done!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser("Export Grounding DINO Model to ONNX", add_help=True)
    parser.add_argument(
        "--config_file",
        "-c",
        type=str,
        required=True,
        help="path to config file",
    )
    parser.add_argument(
        "--ckpt_file",
        "-p",
        type=str,
        required=True,
        help="path to checkpoint file",
    )
    parser.add_argument("--output_dir", "-o", type=str, help="output directory")
    parser.add_argument(
        "--img_path",
        "-i",
        type=str,
        default="asset/demo2.jpg",
        help="Test image",
    )
    parser.add_argument(
        "--text_prompt",
        "-t",
        type=str,
        default="The running dog",
        help="Text prompt",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cpu",
        choices=["cpu", "gpu"],
        help="Device",
    )
    parser.add_argument("--box_threshold", type=float, default=0.3, help="Box prediction score")
    parser.add_argument("--text_threshold", type=float, default=0.25, help="Text prompt score")
    args = parser.parse_args()

    # cfg
    config_file = args.config_file  # change the path of the model config file
    ckpt_file = args.ckpt_file  # change the path of the model
    output_dir = args.output_dir
    img_path = args.img_path
    text_prompt = args.text_prompt
    device = args.device
    box_threshold = args.box_threshold
    text_threshold = args.text_threshold
    onnx_file = osp.splitext(osp.basename(ckpt_file))[0] + ".onnx"

    # make dir
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        onnx_file = osp.join(output_dir, onnx_file)
    print(f"onnx_file = {onnx_file}")

    if not osp.exists(onnx_file):
        # load model
        model = load_pt_model(config_file, ckpt_file, cpu_only=True)
        # export model
        export_onnx(model, onnx_file)
````

This is only a quick rewrite for now; you can add the official quantization on top as well, and the usage stays the same (see the sketch below).
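The quantization step itself is not shown in the thread; as an assumption about what the official quant refers to, here is a minimal sketch using onnxruntime's dynamic-quantization API, with hypothetical file names:

```python
from onnxruntime.quantization import QuantType, quantize_dynamic

# Hypothetical paths -- point these at your exported model.
fp32_model = "groundingdino_swinb_cogcoor.onnx"
int8_model = "groundingdino_swinb_cogcoor_quant.onnx"

# Dynamic quantization converts the weights to 8-bit offline and
# quantizes activations on the fly at inference time.
quantize_dynamic(fp32_model, int8_model, weight_type=QuantType.QUInt8)
```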
To replace the model, first swap out:

```python
@staticmethod
def get_tokenlizer(text_encoder_type="bert-base-uncased"):
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained(text_encoder_type)
    return tokenizer
```

Next, replace `preprocess` with:
```python
def preprocess(self, image, text_prompt, img_mask=None):
    # encoder texts
    captions = self.get_caption(str(text_prompt))
    tokenized_raw_results = self.net.tokenizer(
        [captions], padding="max_length", max_length=256, return_tensors="np"
    )
    tokenized = {
        "input_ids": tokenized_raw_results.input_ids.astype(np.int64),
        "token_type_ids": tokenized_raw_results.token_type_ids.astype(np.int64),
        "attention_mask": tokenized_raw_results.attention_mask,
    }
    specical_tokens = [101, 102, 1012, 1029]
    (
        text_self_attention_masks,
        position_ids,
        _,
    ) = self.generate_masks_with_special_tokens_and_transfer_map(tokenized, specical_tokens)
    if text_self_attention_masks.shape[1] > self.net.max_text_len:
        text_self_attention_masks = text_self_attention_masks[
            :, : self.net.max_text_len, : self.net.max_text_len
        ]
        position_ids = position_ids[:, : self.net.max_text_len]
        tokenized["input_ids"] = tokenized["input_ids"][:, : self.net.max_text_len]
        tokenized["attention_mask"] = tokenized["attention_mask"][:, : self.net.max_text_len]
        tokenized["token_type_ids"] = tokenized["token_type_ids"][:, : self.net.max_text_len]
    inputs = {}
    inputs["img"] = image
    inputs["input_ids"] = np.array(tokenized["input_ids"], dtype=np.int64)
    inputs["attention_mask"] = np.array(tokenized["attention_mask"], dtype=bool)
    inputs["token_type_ids"] = np.array(tokenized["token_type_ids"], dtype=np.int64)
    inputs["position_ids"] = np.array(position_ids, dtype=np.int64)
    inputs["text_token_mask"] = np.array(text_self_attention_masks, dtype=bool)
    return image, inputs, captions
```

Finally, replace `postprocess` with:
```python
def postprocess(self, outputs, caption, with_logits=True, token_spans=None):
    logits, boxes = outputs
    logits_filt = np.squeeze(logits, 0)  # prediction_logits.shape = (nq, 256)
    logits_filt = self.sig(logits_filt)
    boxes_filt = np.squeeze(boxes, 0)  # prediction_boxes.shape = (nq, 4)
    # filter output
    if token_spans is None:
        filt_mask = logits_filt.max(axis=1) > self.box_threshold
        logits_filt = logits_filt[filt_mask]  # num_filt, 256
        boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
        # get phrase
        tokenlizer = self.net.tokenizer
        tokenized_raw_results = tokenlizer(caption, return_tensors="np")
        tokenized = {
            "input_ids": tokenized_raw_results.input_ids[0].astype(np.int64),
            "token_type_ids": tokenized_raw_results.token_type_ids[0].astype(np.int64),
            "attention_mask": tokenized_raw_results.attention_mask[0],
        }
        # build pred
        pred_phrases = []
        for logit in logits_filt:
            posmap = logit > self.text_threshold
            pred_phrase = self.get_phrases_from_posmap(posmap, tokenized, tokenlizer)
            if with_logits:
                pred_phrases.append([pred_phrase.strip(), logit.max()])
            else:
                pred_phrases.append([pred_phrase.strip(), 1.0])
    else:
        # TODO: Using token_spans.
        raise NotImplementedError
    return boxes_filt, pred_phrases
```

This should solve the problem.
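The `self.sig` and `self.get_phrases_from_posmap` helpers referenced above are not shown in the thread; for reference, here is a minimal NumPy sketch of what they plausibly do (modeled on the corresponding GroundingDINO utilities, so treat the details as assumptions):

```python
import numpy as np


def sig(x):
    # Plain sigmoid: maps raw logits to per-token scores in (0, 1).
    return 1.0 / (1.0 + np.exp(-x))


def get_phrases_from_posmap(posmap, tokenized, tokenizer):
    # Keep the token ids whose score cleared the text threshold and
    # decode them back into a phrase.
    token_ids = [tokenized["input_ids"][i] for i in np.nonzero(posmap)[0]]
    return tokenizer.decode(token_ids)
```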
Hello, there is a bug I would appreciate your taking a look at.
I am using the GroundingDINO (SwinB-QInt8) IDEA model to auto-annotate human heads. With
__preferred_device__ = "CPU"
it annotates correctly with no errors. I have already set up the environment for GPU execution, and other models such as DAMO_YOLO run fine on the GPU, but when running GroundingDINO on the GPU I get the following errors:
When loading the model:
2024-04-25 09:19:09.4486108 [W:onnxruntime:, transformer_memcpy.cc:74 onnxruntime::MemcpyTransformer::ApplyImpl] 1264 Memcpy nodes are added to the graph torch_jit for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level = 1 to see the detail logs before this message.
2024-04-25 09:19:09.6402018 [W:onnxruntime:, session_state.cc:1166 onnxruntime::VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2024-04-25 09:19:09.6405929 [W:onnxruntime:, session_state.cc:1168 onnxruntime::VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.
During automatic annotation:
Error in predict_shapes: [ONNXRuntimeError] : 1 : FAIL : Non-zero status code returned while running Expand node. Name:'Expand_33526' Status Message: Expand_33526: left operand cannot broadcast on dim 2 LeftShape: {1,900,4}, RightShape: {1,900,256}
As a result, annotation fails. I have not found an effective way to solve this problem; any guidance would be appreciated.
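For anyone debugging the same setup: the first warning above suggests lowering the session log severity to see the detailed node-assignment logs. A minimal sketch of how to do that (the model path is hypothetical):

```python
import onnxruntime as ort

sess_opts = ort.SessionOptions()
# 0 = VERBOSE: prints the per-node execution-provider assignments that the
# memcpy warning above refers to.
sess_opts.log_severity_level = 0

session = ort.InferenceSession(
    "groundingdino_swinb_quant.onnx",  # hypothetical path to the model
    sess_options=sess_opts,
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
```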