HuggingFace镜像/deepseek-coder-33b-base
模型介绍 | 文件和版本 | 分析
下载使用量0

openMind 使用示例

import argparse

import torch
from openmind import is_torch_npu_available
from openmind import AutoTokenizer, AutoModelForCausalLM


def parse_args():
    """Parse the command-line options of the example script.

    Returns:
        argparse.Namespace: namespace whose ``model_name_or_path``
        attribute holds the value given via ``--model_name_or_path``
        (``"deepseek-coder-33b-base"`` when the option is omitted).
    """
    cli = argparse.ArgumentParser()
    model_option = {
        "type": str,
        "help": "Path to model",
        "default": "deepseek-coder-33b-base",
    }
    cli.add_argument("--model_name_or_path", **model_option)
    return cli.parse_args()
def main():
    """Run a fill-in-the-middle (code insertion) demo and print the result.

    Loads the tokenizer and the causal language model from the path given
    by ``--model_name_or_path`` (falling back to ``"../"`` when that value
    is empty), sends a quick-sort prompt containing a ``<|fim▁hole|>`` gap
    to the model, and prints only the newly generated text.
    """
    args = parse_args()
    # An empty --model_name_or_path falls back to the parent directory.
    model_path = args.model_name_or_path or "../"

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    # The prompt is assembled from explicit lines so that the indentation of
    # this function does not leak into it: the code body is indented by four
    # spaces and the hole marker sits at column 0.
    input_text = (
        "<|fim▁begin|>def quick_sort(arr):\n"
        "    if len(arr) <= 1:\n"
        "        return arr\n"
        "    pivot = arr[0]\n"
        "    left = []\n"
        "    right = []\n"
        "<|fim▁hole|>\n"
        "        if arr[i] < pivot:\n"
        "            left.append(arr[i])\n"
        "        else:\n"
        "            right.append(arr[i])\n"
        "    return quick_sort(left) + [pivot] + quick_sort(right)<|fim▁end|>"
    )
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # max_new_tokens bounds only the generated part; max_length would also
    # count the prompt tokens and leave little room for the completion.
    outputs = model.generate(**inputs, max_new_tokens=128)

    # Drop the prompt at token level instead of slicing the decoded string by
    # len(input_text), which is only correct if decoding reproduces the
    # prompt character-for-character.
    prompt_length = inputs["input_ids"].shape[1]
    completion_ids = outputs[0][prompt_length:]
    print(tokenizer.decode(completion_ids, skip_special_tokens=True))

1. DeepSeek Coder 简介

DeepSeek Coder 是一系列代码语言模型,其训练数据包含 87% 的代码以及 13% 的中英文自然语言,每个模型均在 2T tokens 上进行预训练。我们提供从 1.3B 到 33B 等多种规模的代码模型。每个模型均在项目级代码语料上进行预训练,采用 16K 的窗口大小并引入额外的填空任务,以支持项目级代码补全和填充。在编码能力方面,DeepSeek Coder 在多种编程语言和各类基准测试中,均达到了开源代码模型的领先水平。

  • 海量训练数据:训练数据规模达 2T tokens,其中包含 87% 的代码以及 13% 的中英文自然语言数据。

  • 高度灵活与可扩展:提供 1.3B、5.7B、6.7B 和 33B 等不同规模的模型,方便用户根据自身需求选择最适合的配置。

  • 卓越的模型性能:在 HumanEval、MultiPL-E、MBPP、DS-1000 和 APPS 等基准测试中,性能表现领先于其他公开可用的代码模型。

  • 先进的代码补全能力:采用 16K 窗口大小并结合填空任务,支持项目级代码补全和填充任务。

2. 模型概述

deepseek-coder-33b-base 是一个拥有 330 亿参数的模型,采用分组查询注意力(Grouped-Query Attention)机制,在 2 万亿 tokens 上完成训练。

  • 主页:DeepSeek
  • 代码仓库:deepseek-ai/deepseek-coder
  • 与 DeepSeek Coder 对话:DeepSeek-Coder

3. 使用方法

以下为模型使用示例。

1)代码补全

from modelscope import AutoTokenizer, AutoModelForCausalLM
import torch

# Tokenizer and model are loaded from the same repository id.
MODEL_ID = "deepseek-ai/deepseek-coder-33b-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model = model.cuda()

# A comment describing the task serves as the completion prompt.
prompt = "#write a quick sort algorithm"
encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
generated = model.generate(**encoded, max_length=128)

# Print the prompt together with its continuation.
completion = tokenizer.decode(generated[0], skip_special_tokens=True)
print(completion)

2)代码插入

from modelscope import AutoTokenizer, AutoModelForCausalLM
import torch
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-33b-base", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-33b-base", trust_remote_code=True).cuda()
# Fill-in-the-middle prompt: the model is asked to produce the code that
# belongs at the <|fim▁hole|> marker.
input_text = """<|fim▁begin|>def quick_sort(arr):
    if len(arr) <= 1:
        return arr
    pivot = arr[0]
    left = []
    right = []
<|fim▁hole|>
        if arr[i] < pivot:
            left.append(arr[i])
        else:
            right.append(arr[i])
    return quick_sort(left) + [pivot] + quick_sort(right)<|fim▁end|>"""
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
# max_new_tokens bounds only the generated part; max_length would also count
# the tokens of this fairly long prompt.
outputs = model.generate(**inputs, max_new_tokens=128)
# Remove the prompt at token level rather than slicing the decoded string by
# len(input_text), which is only correct if decoding reproduces the prompt
# character-for-character.
prompt_length = inputs["input_ids"].shape[1]
print(tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True))

3)仓库级代码补全

from modelscope import AutoTokenizer, AutoModelForCausalLM

# Tokenizer and model are loaded from the same repository id.
MODEL_ID = "deepseek-ai/deepseek-coder-33b-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model = model.cuda()

# The prompt concatenates three files (utils.py, model.py, main.py), each
# introduced by a "#<filename>" comment line; main.py is left unfinished so
# that the model continues it.
prompt = """#utils.py
import torch
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

def load_data():
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    # Standardize the data
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Convert numpy data to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.int64)
    y_test = torch.tensor(y_test, dtype=torch.int64)
    
    return X_train, X_test, y_train, y_test

def evaluate_predictions(y_test, y_pred):
    return accuracy_score(y_test, y_pred)
#model.py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class IrisClassifier(nn.Module):
    def __init__(self):
        super(IrisClassifier, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(4, 16),
            nn.ReLU(),
            nn.Linear(16, 3)
        )

    def forward(self, x):
        return self.fc(x)

    def train_model(self, X_train, y_train, epochs, lr, batch_size):
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)
        
        # Create DataLoader for batches
        dataset = TensorDataset(X_train, y_train)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        for epoch in range(epochs):
            for batch_X, batch_y in dataloader:
                optimizer.zero_grad()
                outputs = self(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

    def predict(self, X_test):
        with torch.no_grad():
            outputs = self(X_test)
            _, predicted = outputs.max(1)
        return predicted.numpy()
#main.py
from utils import load_data, evaluate_predictions
from model import IrisClassifier as Classifier

def main():
    # Model training and evaluation
"""
encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
generated = model.generate(**encoded, max_new_tokens=140)

# Decode the full sequence (prompt plus continuation) and print it.
decoded = tokenizer.decode(generated[0])
print(decoded)

4. 许可证

本代码仓库采用 MIT 许可证授权。DeepSeek Coder 模型的使用受模型许可证约束。DeepSeek Coder 支持商业用途。

更多详情请参见 LICENSE-MODEL。

5. 联系方式

如有任何问题,请提交 issue 或通过邮箱 agi_code@deepseek.com 与我们联系。