feat: HF /scan endpoint (#2566)

* start by checking /scan during the checksum update

Signed-off-by: Dave Lee <dave@gray101.com>

* add back in golang side features: downloader/uri gets struct and scan function, gallery uses it, and secscan/models calls it.

Signed-off-by: Dave Lee <dave@gray101.com>

* add a param to scan specific urls - useful for debugging

Signed-off-by: Dave Lee <dave@gray101.com>

* helpful printouts

Signed-off-by: Dave Lee <dave@gray101.com>

* fix offsets

Signed-off-by: Dave Lee <dave@gray101.com>

* fix error and naming

Signed-off-by: Dave Lee <dave@gray101.com>

* expose error

Signed-off-by: Dave Lee <dave@gray101.com>

* fix json tags

Signed-off-by: Dave Lee <dave@gray101.com>

* slight wording change

Signed-off-by: Dave Lee <dave@gray101.com>

* go mod tidy - getting warnings

Signed-off-by: Dave Lee <dave@gray101.com>

* split out python to make editing easier, add some simple code  to delete contaminated entries from gallery

Signed-off-by: Dave Lee <dave@gray101.com>

* o7 to my favorite part of our old name, go-skynet

Signed-off-by: Dave Lee <dave@gray101.com>

* merge fix

Signed-off-by: Dave Lee <dave@gray101.com>

* merge fix

Signed-off-by: Dave Lee <dave@gray101.com>

* merge fix

Signed-off-by: Dave Lee <dave@gray101.com>

* address review comments

Signed-off-by: Dave Lee <dave@gray101.com>

* forgot secscan could accept multiple URL at once

Signed-off-by: Dave Lee <dave@gray101.com>

* invert naming and actually use it

Signed-off-by: Dave Lee <dave@gray101.com>

* missed cli/models.go

Signed-off-by: Dave Lee <dave@gray101.com>

* Update .github/check_and_update.py

Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Signed-off-by: Dave <dave@gray101.com>

---------

Signed-off-by: Dave Lee <dave@gray101.com>
Signed-off-by: Dave <dave@gray101.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
Dave 2024-07-10 07:18:32 -04:00 committed by GitHub
parent cbb93bd8ec
commit 133987b1fb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 282 additions and 125 deletions

View file

@ -3,6 +3,8 @@ package downloader
import (
"crypto/sha256"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
@ -129,6 +131,7 @@ func ConvertURL(s string) string {
// e.g. TheBloke/Mixtral-8x7B-v0.1-GGUF/mixtral-8x7b-v0.1.Q2_K.gguf@main -> https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q2_K.gguf
owner := strings.Split(repository, "/")[0]
repo := strings.Split(repository, "/")[1]
branch := "main"
if strings.Contains(repo, "@") {
branch = strings.Split(repository, "@")[1]
@ -353,3 +356,42 @@ func calculateSHA(filePath string) (string, error) {
return fmt.Sprintf("%x", hash.Sum(nil)), nil
}
type HuggingFaceScanResult struct {
RepositoryId string `json:"repositoryId"`
Revision string `json:"revision"`
HasUnsafeFiles bool `json:"hasUnsafeFile"`
ClamAVInfectedFiles []string `json:"clamAVInfectedFiles"`
DangerousPickles []string `json:"dangerousPickles"`
ScansDone bool `json:"scansDone"`
}
var ErrNonHuggingFaceFile = errors.New("not a huggingface repo")
var ErrUnsafeFilesFound = errors.New("unsafe files found")
func HuggingFaceScan(uri string) (*HuggingFaceScanResult, error) {
cleanParts := strings.Split(ConvertURL(uri), "/")
if len(cleanParts) <= 4 || cleanParts[2] != "huggingface.co" {
return nil, ErrNonHuggingFaceFile
}
results, err := http.Get(fmt.Sprintf("https://huggingface.co/api/models/%s/%s/scan", cleanParts[3], cleanParts[4]))
if err != nil {
return nil, err
}
if results.StatusCode != 200 {
return nil, fmt.Errorf("unexpected status code during HuggingFaceScan: %d", results.StatusCode)
}
scanResult := &HuggingFaceScanResult{}
bodyBytes, err := io.ReadAll(results.Body)
if err != nil {
return nil, err
}
err = json.Unmarshal(bodyBytes, scanResult)
if err != nil {
return nil, err
}
if scanResult.HasUnsafeFiles {
return scanResult, ErrUnsafeFilesFound
}
return scanResult, nil
}