Make WIN32 mmap() improvements (#341)

Still not fully working yet. Closes #341
1 year ago · e4881686b4
parent 0b5448a3a4
commit e4881686b4
5 changed files with 565 additions and 22 deletions
--- a/.gitignore
+++ b/.gitignore
@ -22,3 +22,412 @@ models/*

 arm_neon.h
 compile_commands.json
+CMakeFiles/
+CMakeCache.txt
+
+# Visual Studio stuff
+*.exe
+*.sln
+*.vcxproj
+*.vcxproj.filters
+cmake_install.cmake
+*.dir/
+
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+##
+## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
+
+# User-specific files
+*.rsuser
+*.suo
+*.user
+*.userosscache
+*.sln.docstates
+
+# User-specific files (MonoDevelop/Xamarin Studio)
+*.userprefs
+
+# Mono auto generated files
+mono_crash.*
+
+# Build results
+[Dd]ebug/
+[Dd]ebugPublic/
+[Rr]elease/
+[Rr]eleases/
+x64/
+x86/
+[Ww][Ii][Nn]32/
+[Aa][Rr][Mm]/
+[Aa][Rr][Mm]64/
+bld/
+[Bb]in/
+[Oo]bj/
+[Ll]og/
+[Ll]ogs/
+
+# Visual Studio 2015/2017 cache/options directory
+.vs/
+# Uncomment if you have tasks that create the project's static files in wwwroot
+#wwwroot/
+
+# Visual Studio 2017 auto generated files
+Generated\ Files/
+
+# MSTest test Results
+[Tt]est[Rr]esult*/
+[Bb]uild[Ll]og.*
+
+# NUnit
+*.VisualState.xml
+TestResult.xml
+nunit-*.xml
+
+# Build Results of an ATL Project
+[Dd]ebugPS/
+[Rr]eleasePS/
+dlldata.c
+
+# Benchmark Results
+BenchmarkDotNet.Artifacts/
+
+# .NET Core
+project.lock.json
+project.fragment.lock.json
+artifacts/
+
+# ASP.NET Scaffolding
+ScaffoldingReadMe.txt
+
+# StyleCop
+StyleCopReport.xml
+
+# Files built by Visual Studio
+*_i.c
+*_p.c
+*_h.h
+*.ilk
+*.meta
+*.obj
+*.iobj
+*.pch
+*.pdb
+*.ipdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.tmp_proj
+*_wpftmp.csproj
+*.log
+*.tlog
+*.vspscc
+*.vssscc
+.builds
+*.pidb
+*.svclog
+*.scc
+
+# Chutzpah Test files
+_Chutzpah*
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opendb
+*.opensdf
+*.sdf
+*.cachefile
+*.VC.db
+*.VC.VC.opendb
+
+# Visual Studio profiler
+*.psess
+*.vsp
+*.vspx
+*.sap
+
+# Visual Studio Trace Files
+*.e2e
+
+# TFS 2012 Local Workspace
+$tf/
+
+# Guidance Automation Toolkit
+*.gpState
+
+# ReSharper is a .NET coding add-in
+_ReSharper*/
+*.[Rr]e[Ss]harper
+*.DotSettings.user
+
+# TeamCity is a build add-in
+_TeamCity*
+
+# DotCover is a Code Coverage Tool
+*.dotCover
+
+# AxoCover is a Code Coverage Tool
+.axoCover/*
+!.axoCover/settings.json
+
+# Coverlet is a free, cross platform Code Coverage Tool
+coverage*.json
+coverage*.xml
+coverage*.info
+
+# Visual Studio code coverage results
+*.coverage
+*.coveragexml
+
+# NCrunch
+_NCrunch_*
+.*crunch*.local.xml
+nCrunchTemp_*
+
+# MightyMoose
+*.mm.*
+AutoTest.Net/
+
+# Web workbench (sass)
+.sass-cache/
+
+# Installshield output folder
+[Ee]xpress/
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish/
+
+# Publish Web Output
+*.[Pp]ublish.xml
+*.azurePubxml
+# Note: Comment the next line if you want to checkin your web deploy settings,
+# but database connection strings (with potential passwords) will be unencrypted
+*.pubxml
+*.publishproj
+
+# Microsoft Azure Web App publish settings. Comment the next line if you want to
+# checkin your Azure Web App publish settings, but sensitive information contained
+# in these scripts will be unencrypted
+PublishScripts/
+
+# NuGet Packages
+*.nupkg
+# NuGet Symbol Packages
+*.snupkg
+# The packages folder can be ignored because of Package Restore
+**/[Pp]ackages/*
+# except build/, which is used as an MSBuild target.
+!**/[Pp]ackages/build/
+# Uncomment if necessary however generally it will be regenerated when needed
+#!**/[Pp]ackages/repositories.config
+# NuGet v3's project.json files produces more ignorable files
+*.nuget.props
+*.nuget.targets
+
+# Microsoft Azure Build Output
+csx/
+*.build.csdef
+
+# Microsoft Azure Emulator
+ecf/
+rcf/
+
+# Windows Store app package directories and files
+AppPackages/
+BundleArtifacts/
+Package.StoreAssociation.xml
+_pkginfo.txt
+*.appx
+*.appxbundle
+*.appxupload
+
+# Visual Studio cache files
+# files ending in .cache can be ignored
+*.[Cc]ache
+# but keep track of directories ending in .cache
+!?*.[Cc]ache/
+
+# Others
+ClientBin/
+~$*
+*~
+*.dbmdl
+*.dbproj.schemaview
+*.jfm
+*.pfx
+*.publishsettings
+orleans.codegen.cs
+
+# Including strong name files can present a security risk
+# (https://github.com/github/gitignore/pull/2483#issue-259490424)
+#*.snk
+
+# Since there are multiple workflows, uncomment next line to ignore bower_components
+# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
+#bower_components/
+
+# RIA/Silverlight projects
+Generated_Code/
+
+# Backup & report files from converting an old project file
+# to a newer Visual Studio version. Backup files are not needed,
+# because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+UpgradeLog*.htm
+ServiceFabricBackup/
+*.rptproj.bak
+
+# SQL Server files
+*.mdf
+*.ldf
+*.ndf
+
+# Business Intelligence projects
+*.rdl.data
+*.bim.layout
+*.bim_*.settings
+*.rptproj.rsuser
+*- [Bb]ackup.rdl
+*- [Bb]ackup ([0-9]).rdl
+*- [Bb]ackup ([0-9][0-9]).rdl
+
+# Microsoft Fakes
+FakesAssemblies/
+
+# GhostDoc plugin setting file
+*.GhostDoc.xml
+
+# Node.js Tools for Visual Studio
+.ntvs_analysis.dat
+node_modules/
+
+# Visual Studio 6 build log
+*.plg
+
+# Visual Studio 6 workspace options file
+*.opt
+
+# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
+*.vbw
+
+# Visual Studio 6 auto-generated project file (contains which files were open etc.)
+*.vbp
+
+# Visual Studio 6 workspace and project file (working project files containing files to include in project)
+*.dsw
+*.dsp
+
+# Visual Studio 6 technical files
+*.ncb
+*.aps
+
+# Visual Studio LightSwitch build output
+**/*.HTMLClient/GeneratedArtifacts
+**/*.DesktopClient/GeneratedArtifacts
+**/*.DesktopClient/ModelManifest.xml
+**/*.Server/GeneratedArtifacts
+**/*.Server/ModelManifest.xml
+_Pvt_Extensions
+
+# Paket dependency manager
+.paket/paket.exe
+paket-files/
+
+# FAKE - F# Make
+.fake/
+
+# CodeRush personal settings
+.cr/personal
+
+# Python Tools for Visual Studio (PTVS)
+__pycache__/
+*.pyc
+
+# Cake - Uncomment if you are using it
+# tools/**
+# !tools/packages.config
+
+# Tabs Studio
+*.tss
+
+# Telerik's JustMock configuration file
+*.jmconfig
+
+# BizTalk build output
+*.btp.cs
+*.btm.cs
+*.odx.cs
+*.xsd.cs
+
+# OpenCover UI analysis results
+OpenCover/
+
+# Azure Stream Analytics local run output
+ASALocalRun/
+
+# MSBuild Binary and Structured Log
+*.binlog
+
+# NVidia Nsight GPU debugger configuration file
+*.nvuser
+
+# MFractors (Xamarin productivity tool) working folder
+.mfractor/
+
+# Local History for Visual Studio
+.localhistory/
+
+# Visual Studio History (VSHistory) files
+.vshistory/
+
+# BeatPulse healthcheck temp database
+healthchecksdb
+
+# Backup folder for Package Reference Convert tool in Visual Studio 2017
+MigrationBackup/
+
+# Ionide (cross platform F# VS Code tools) working folder
+.ionide/
+
+# Fody - auto-generated XML schema
+FodyWeavers.xsd
+
+# VS Code files for those working on multiple tools
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+*.code-workspace
+
+# Local History for Visual Studio Code
+.history/
+
+# Windows Installer files from build outputs
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+
+# JetBrains Rider
+*.sln.iml
--- a/ggml.c
+++ b/ggml.c
@ -2,7 +2,7 @@
 #include "ggml.h"

 #if defined(_MSC_VER) || defined(__MINGW32__)
-#include <malloc.h> // using malloc.h with MSC/MINGW
+//#include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__)
 #include <alloca.h>
 #endif
@ -2437,7 +2437,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {

    *ctx = (struct ggml_context) {
        /*.mem_size         =*/ params.mem_size,
-        /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
+        /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : _malloc(params.mem_size),
        /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
        /*.n_objects        =*/ 0,
        /*.objects_begin    =*/ NULL,
@ -2469,7 +2469,7 @@ void ggml_free(struct ggml_context * ctx) {
                    __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);

            if (ctx->mem_buffer_owned) {
-                free(ctx->mem_buffer);
+                _free(ctx->mem_buffer);
            }

            found = true;
--- a/ggml.h
+++ b/ggml.h
@ -183,6 +183,9 @@ extern "C" {
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_OPT      4

+void* _malloc(size_t n);
+void _free(void* p);
+
 #ifdef __ARM_NEON
 // we use the built-in 16-bit float type
 typedef __fp16 ggml_fp16_t;
--- a/main.cpp
+++ b/main.cpp
@ -1,3 +1,7 @@
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#define NOMINMAX
+#endif
+
 #include "ggml.h"

 #include "utils.h"
@ -19,6 +23,10 @@
 #include <unistd.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
+#else
+#include <errno.h>
+#define msync(addr, len_bytes, flag) winMSync
+#define MS_ASYNC 0
 #endif

 #define ROUNDUP(X, K) (((X) + (K)-1) & -(K))
@ -96,6 +104,7 @@ struct llama_model {
    std::map<std::string, struct ggml_tensor *> tensors;
 };

+
 struct magic {
    uint32_t magic;
    std::atomic<unsigned> lock;
@ -103,10 +112,37 @@ struct magic {
    size_t commit;
    size_t offset;
    size_t capacity;
-    gpt_vocab *vocab;
-    llama_model *model;
+    gpt_vocab* vocab;
+    llama_model* model;
 };

+static void winMSync(magic* addr, size_t len_bytes) {
+    bool success = FlushViewOfFile((void*)addr, len_bytes);
+    if (!success) {
+        LPVOID lpMsgBuf;
+        LPVOID lpDisplayBuf;
+        DWORD error_code = GetLastError();
+        FormatMessage(
+            FORMAT_MESSAGE_ALLOCATE_BUFFER |
+            FORMAT_MESSAGE_FROM_SYSTEM |
+            FORMAT_MESSAGE_IGNORE_INSERTS,
+            NULL,
+            error_code,
+            MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+            (LPTSTR)&lpMsgBuf,
+            0, NULL);
+        lpDisplayBuf = (LPVOID)LocalAlloc(LMEM_ZEROINIT,
+            (lstrlen((LPCTSTR)lpMsgBuf) + 40) * sizeof(TCHAR));
+        StringCchPrintf((LPTSTR)lpDisplayBuf,
+            LocalSize(lpDisplayBuf) / sizeof(TCHAR),
+            TEXT("failed with error %d: %s"),
+            error_code, lpMsgBuf);
+    }
+    HANDLE hFile = (HANDLE)_get_osfhandle(addr->fd);
+    FlushFileBuffers(hFile);
+}
+
+
 static struct magic *mag;

 static inline void spin_lock(std::atomic<unsigned> &lock) {
@ -129,17 +165,26 @@ static void magic_commit(void) {
    mag->offset = mag->capacity;
    mag->commit = mag->capacity;
    mag->magic = 0xFEEDABEE;
-    msync(mag, mag->commit, MS_ASYNC);
+    bool success = msync(mag, mag->commit, MS_ASYNC);   
 }

 static void magic_init(void) {
    int fd;
    size_t n;
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
    struct stat st;
+#else
+    struct _stat64 st;
+#endif
    if (mag) return;
    n = ROUNDUP(sizeof(struct magic), MAGIC_GRAN);
    if ((fd = open(MAGIC_PATH, O_RDWR)) != -1) {
-        fstat(fd, &st);
+        int result = fstat(fd, &st);
+        int error = errno;
+        if (errno == EBADF)
+            fprintf(stderr, "Bad file descriptor.\n");
+        else if (errno == EINVAL)
+            fprintf(stderr, "Invalid argument to _fstat.\n");
        if (st.st_size >= n) {
            mag = (struct magic *)Mmap(MAGIC_ADDR, n,
                                       PROT_READ | PROT_WRITE,
@ -182,9 +227,9 @@ void *memalign(size_t a, size_t n) {
    i = i + sizeof(size_t);
    i = ROUNDUP(i, a);
    j = ROUNDUP(i + m, MAGIC_GRAN);
-    if (j > mag->capacity) {
+    //if (j > mag->capacity) {
        if (!mag->magic) {
-            ftruncate(mag->fd, j);
+            int result = ftruncate(mag->fd, j);
            p = mmap(MAGIC_ADDR + mag->capacity,
                     j - mag->capacity, PROT_READ | PROT_WRITE,
                     MAP_SHARED | MAP_FIXED, mag->fd, mag->capacity);
@ -199,7 +244,7 @@ void *memalign(size_t a, size_t n) {
            spin_unlock(mag->lock);
            return 0;
        }
-    }
+    //}
    mag->offset = i + m;
    spin_unlock(mag->lock);
    p = MAGIC_ADDR + i;
@ -207,7 +252,7 @@ void *memalign(size_t a, size_t n) {
    return p;
 }

-void *malloc(size_t n) {
+void *_malloc(size_t n) {
    return memalign(MAGIC_ALGN, n);
 }

@ -215,33 +260,53 @@ size_t malloc_usable_size(const void *p) {
    return ((const size_t *)p)[-1];
 }

-void *calloc(size_t n, size_t z) {
+void *_calloc(size_t n, size_t z) {
    void *p;
-    if ((p = malloc((n *= z)))) {
+    if ((p = _malloc((n *= z)))) {
        memset(p, 0, n);
    }
    return p;
 }

-void free(void *p) {
+void _free(void *p) {
    // do nothing
 }

-void *realloc(void *p, size_t n) {
+void *_realloc(void *p, size_t n) {
    void *q;
    if (!p) {
-        return malloc(n);
+        return _malloc(n);
    }
    if (!n) {
-        free(p);
+        _free(p);
        return 0;
    }
-    if ((q = malloc(n))) {
+    if ((q = _malloc(n))) {
        memcpy(q, p, ((const size_t *)p)[-1]);
    }
    return q;
 }

+#if defined(malloc)
+# undef malloc
+#endif
+#define malloc(x) _malloc(x)
+
+#if defined(calloc)
+# undef calloc
+#endif
+#define calloc(x) _calloc(x)
+
+#if defined(realloc)
+# undef realloc
+#endif
+#define realloc(x) _realloc(x)
+
+#if defined(free)
+# undef free
+#endif
+#define free(x) _free(x)
+
 // load the model's weights from a file
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
@ -707,7 +772,7 @@ bool llama_eval(
    const int d_key = n_embd/n_head;

    static size_t buf_size = 512u*1024*1024;
-    static void * buf = malloc(buf_size);
+    static void * buf = _malloc(buf_size);

    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
@ -715,7 +780,7 @@ bool llama_eval(

        // reallocate
        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
+        buf = _realloc(buf, buf_size);
        if (buf == nullptr) {
            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
            return false;
--- a/mmap.h
+++ b/mmap.h
@ -20,6 +20,7 @@
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #ifndef __MINGW32__
 #include <Windows.h>
+#include <strsafe.h>
 #else
 #include <windows.h>
 #endif
@ -101,7 +102,7 @@
 #define close _close
 #endif
 #ifndef fstat
-#define fstat _fstat
+#define fstat _fstati64
 #endif
 #ifndef madvise
 #define madvise WinMadvise
@ -113,6 +114,7 @@
 static std::atomic<unsigned> g_winlock;
 static std::map<LPVOID, HANDLE> g_winmap;

+
 static void WinLock(void) {
    while (!g_winlock.exchange(1, std::memory_order_acquire));
 }
@ -121,7 +123,30 @@ static void WunLock(void) {
    g_winlock.store(0, std::memory_order_release);
 }

-static int WinMadvise(int fd, size_t length, int flags) {
+static int WinMadvise(char* fd, size_t length, int flags) {
+    auto p_handle = GetCurrentProcess();
+    struct _WIN32_MEMORY_RANGE_ENTRY entry((void*)fd, length);
+    bool success = PrefetchVirtualMemory(p_handle, 1, &entry, 0);
+    if (!success) {
+        LPVOID lpMsgBuf;
+        LPVOID lpDisplayBuf;
+        DWORD error_code = GetLastError();
+        FormatMessage(
+            FORMAT_MESSAGE_ALLOCATE_BUFFER |
+            FORMAT_MESSAGE_FROM_SYSTEM |
+            FORMAT_MESSAGE_IGNORE_INSERTS,
+            NULL,
+            error_code,
+            MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+            (LPTSTR)&lpMsgBuf,
+            0, NULL);
+        lpDisplayBuf = (LPVOID)LocalAlloc(LMEM_ZEROINIT,
+            (lstrlen((LPCTSTR)lpMsgBuf) + 256) * sizeof(TCHAR));
+        StringCchPrintf((LPTSTR)lpDisplayBuf,
+            LocalSize(lpDisplayBuf) / sizeof(TCHAR),
+            TEXT("%s failed with error %d: %s"),
+            error_code, lpMsgBuf);
+    }
    return 0;
 }

@ -215,11 +240,52 @@ static void *WinMap(void *addr, size_t length, int prot,
                             (offset + length) >> 32,
                             (offset + length), 0);
    if (!hand) {
+        LPVOID lpMsgBuf;
+        LPVOID lpDisplayBuf;
+        DWORD error_code = GetLastError();
+        FormatMessage(
+            FORMAT_MESSAGE_ALLOCATE_BUFFER |
+            FORMAT_MESSAGE_FROM_SYSTEM |
+            FORMAT_MESSAGE_IGNORE_INSERTS,
+            NULL,
+            error_code,
+            MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+            (LPTSTR)&lpMsgBuf,
+            0, NULL);
+        lpDisplayBuf = (LPVOID)LocalAlloc(LMEM_ZEROINIT,
+            (lstrlen((LPCTSTR)lpMsgBuf) + 256) * sizeof(TCHAR));
+        StringCchPrintf((LPTSTR)lpDisplayBuf,
+            LocalSize(lpDisplayBuf) / sizeof(TCHAR),
+            TEXT("%s failed with error %d: %s"),
+            error_code, lpMsgBuf);
        return MAP_FAILED;
    }
+    if (winprot == PAGE_WRITECOPY) {
+        access = FILE_MAP_COPY;
+    }
+
    res = MapViewOfFileEx(hand, access, offset >> 32,
                          offset, length, addr);
    if (!res) {
+        LPVOID lpMsgBuf;
+        LPVOID lpDisplayBuf;
+        DWORD error_code = GetLastError();
+        FormatMessage(
+            FORMAT_MESSAGE_ALLOCATE_BUFFER |
+            FORMAT_MESSAGE_FROM_SYSTEM |
+            FORMAT_MESSAGE_IGNORE_INSERTS,
+            NULL,
+            error_code,
+            MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+            (LPTSTR)&lpMsgBuf,
+            0, NULL);
+        lpDisplayBuf = (LPVOID)LocalAlloc(LMEM_ZEROINIT,
+            (lstrlen((LPCTSTR)lpMsgBuf) + 40) * sizeof(TCHAR));
+        StringCchPrintf((LPTSTR)lpDisplayBuf,
+            LocalSize(lpDisplayBuf) / sizeof(TCHAR),
+            TEXT("failed with error %d: %s"),
+            error_code, lpMsgBuf);
+        fprintf(stderr, (char*)lpDisplayBuf);
        CloseHandle(hand);
        return MAP_FAILED;
    }