Support sharding in model builder #249
base: main
Conversation
@@ -568,6 +611,10 @@ def make_add_bias(self, add, name, root_input, **kwargs):
        else:
            self.make_add(name, add_bias_inputs, dtype=self.io_dtype, shape=shape)

    def make_all_reduce(self, name, root_input):
        output = f"{name}/output_0"
Is it possible to add the value info for the node's output?
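A minimal sketch of what that might look like, assuming the node is a com.microsoft AllReduce and that the builder exposes a make_value_info helper like its other make_* methods (the helper signature and the shape below are assumptions, not from this diff):

    def make_all_reduce(self, name, root_input):
        output = f"{name}/output_0"
        self.make_node("AllReduce", inputs=[root_input], outputs=[output], name=name, domain="com.microsoft")
        # Register dtype/shape for the node's output so shape inference and
        # graph validation can see through the AllReduce (shape is assumed).
        self.make_value_info(output, self.io_dtype, shape=["batch_size", "sequence_length", self.hidden_size])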
@@ -1038,27 +1100,34 @@ def make_mlp_proj(self, layer_id, mlp, root_input):
        #         Mul
        #          |
        #    DownProjMatMul
        if mlp is None:
Why is this needed?
        fc2_add_name = f"/model/layers.{layer_id}/mlp/fc2/Add"
        self.make_add_bias(mlp.fc2.bias.detach().numpy(), fc2_add_name, root_input=f"{fc2_matmul_name}/output_0")

        # Assign output 0 of MLP layer as output of last layer
        self.mlp_attrs["output_0"] = f"{fc2_add_name}/output_0"

    def make_block_sparse_moe(self, layer_id, bsm, root_input):
        if bsm is None:
Can you draw what the subgraph looks like as a comment? It will help to see it visually for documentation purposes.
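For example, in the style of the MLP diagram above, a rough sketch built from the node names visible in this diff (the exact topology should be confirmed against the implementation):

    #          root_input
    #              |
    #         gate/MatMul
    #              |
    #        gate/Reshape
    #              |
    #      (Sharded)MoE  <-- expert weights 1/2/3 as initializers
    #              |
    #           output_0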
            w3_list.append(torch.reshape(bsm.experts[i].w3.weight, (hidden_size, inter_size)))

        moe_expert_1_name = f"model.layers.{layer_id}.moe.weight_1"
        moe_expert_2_name = f"model.layers.{layer_id}.moe.weight_2"
Can the expert weight names be named model.layers.{layer_id}.moe.experts.{expert_num}.weight instead?
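A sketch of what per-expert naming might look like inside the expert loop. The w1_list name is assumed by symmetry with the w3_list above, and the per-projection suffix is an assumption, since each expert carries three projections:

    for i in range(num_experts):
        # One initializer per expert and per projection, e.g.
        # model.layers.{layer_id}.moe.experts.{i}.w1.weight
        expert_w1_name = f"model.layers.{layer_id}.moe.experts.{i}.w1.weight"
        self.make_external_tensor(w1_list[i].detach().numpy().astype(self.to_numpy_dtype[self.io_dtype]), expert_w1_name)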
        self.make_external_tensor(moe_experts_weight2.astype(self.to_numpy_dtype[self.io_dtype]), moe_expert_2_name)
        self.make_external_tensor(moe_experts_weight3.astype(self.to_numpy_dtype[self.io_dtype]), moe_expert_3_name)
bias_ph = "" # Placeholder for bias |
bias_ph = "" # Placeholder for bias | |
bias_name = "" # Placeholder for bias |
output = f"{moe_name}/output_0" | ||
if self.world_size > 1: | ||
self.make_node("ShardedMoE", inputs=inputs, outputs=[output], name=moe_name, domain="com.microsoft", | ||
k=top_k, activation_type=activation_type, normalize_routing_weights=normalize_routing_weights, tensor_shards=self.world_size) |
Can the logic to create an MoE node be factored out into one node creation function? Something like this:

    op_type = f"{'Sharded' if self.world_size > 1 else ''}MoE"
    kwargs = {"tensor_shards": self.world_size} if self.world_size > 1 else {}
    self.make_node(
        op_type, inputs=inputs, outputs=[output], name=moe_name, domain="com.microsoft",
        k=top_k, activation_type=activation_type, normalize_routing_weights=normalize_routing_weights,
        **kwargs,
    )
@@ -1709,6 +1894,8 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid
        onnx_model = MistralModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options)
    elif config.architectures[0] == "PhiForCausalLM":
        onnx_model = PhiModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options)
    elif config.architectures[0] == "MixtralForCausalLM":
Can you add the new architecture such that the alphabetical order is maintained? This helps quickly identify which architectures are currently supported.
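That is, the new branch would slot in between Mistral and Phi (MixtralModel is the class name implied by this PR; the instantiation line is not shown in the quoted hunk):

    elif config.architectures[0] == "MistralForCausalLM":
        onnx_model = MistralModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options)
    elif config.architectures[0] == "MixtralForCausalLM":
        onnx_model = MixtralModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options)
    elif config.architectures[0] == "PhiForCausalLM":
        onnx_model = PhiModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options)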
@@ -1801,6 +1988,8 @@ def get_args():
            The filename for each component will be '<filename>_<component-name>.onnx' (ex: '<filename>_encoder.onnx', '<filename>_decoder.onnx').
        config_only = Generate config and pre/post processing files only.
            Use this option when you already have your optimized and/or quantized ONNX model.
        world_size = Number of GPUs to use for distributed inference. Default is 1.
Can you add examples that use these extra options in the model builder README?
        gate_reshape_name = f"/model/layers.{layer_id}/moe/gate/Reshape"
        self.make_reshape(gate_reshape_name, [f"{gate_name}/output_0", f"{concat_name}/output_0"], dtype=self.io_dtype, shape=['num_rows', num_experts])

        moe_name = f"/model/layers.{layer_id}/moe"
Can you define this as basename at the beginning and then modify the above node names to use it when defining their names (e.g. f"{basename}/gate/MatMul")? This allows us to change the basename if needed without having to manually update all of the other node names in that subgraph.
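A sketch of the suggested refactor, using the names visible in this diff (the variable name for the gate MatMul is assumed):

    basename = f"/model/layers.{layer_id}/moe"
    gate_name = f"{basename}/gate/MatMul"
    gate_reshape_name = f"{basename}/gate/Reshape"
    moe_name = basename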
Usage example:

    python builder.py -m mistralai/Mixtral-8x7B-v0.1 -e cuda -p fp16 -o ./example-models/mixtral_rank_0 --extra_options world_size=2 rank=0
    python builder.py -m mistralai/Mixtral-8x7B-v0.1 -e cuda -p fp16 -o ./example-models/mixtral_rank_1 --extra_options world_size=2 rank=1

We can expose this usage in the README once multi-GPU inference support in the GenAI tool is ready.