/**
 * =========================================================================
 * File        : ia32.cpp
 * Project     : 0 A.D.
 * Description : C++ and inline asm implementations of IA-32 functions
 * =========================================================================
 */

// license: GPL; see lib/license.txt

#include "precompiled.h"
#include "ia32.h"

#include <string.h>
#include <stdio.h>
#include <vector>
#include <set>
#include <algorithm>

#include "lib/posix/posix.h"    // pthread
#include "lib/bits.h"
#include "lib/timer.h"
#include "lib/module_init.h"
#include "lib/sysdep/cpu.h"

#if !HAVE_MS_ASM && !HAVE_GNU_ASM
#error ia32.cpp needs inline assembly support!
#endif

//-----------------------------------------------------------------------------
// capability bits

// set by ia32_cap_init, referenced by ia32_cap
// treated as 128 bit field; order: std ecx, std edx, ext ecx, ext edx
// keep in sync with enum IA32Cap!
static u32 ia32_caps[4];

static void ia32_cap_init()
{
    u32 regs[4];
    if(ia32_asm_cpuid(1, regs))
    {
        ia32_caps[0] = regs[ECX];
        ia32_caps[1] = regs[EDX];
    }
    if(ia32_asm_cpuid(0x80000001, regs))
    {
        ia32_caps[2] = regs[ECX];
        ia32_caps[3] = regs[EDX];
    }
}

// return whether the given capability is supported.
// cap encodes a table index (upper bits) and bit position (lower 5 bits)
// within the 128-bit ia32_caps field.
bool ia32_cap(IA32Cap cap)
{
    const uint tbl_idx = cap / 32;
    const uint bit_idx = cap % 32;
    if(tbl_idx >= 4)
    {
        DEBUG_WARN_ERR(ERR::INVALID_PARAM);
        return false;
    }
    const u32 mask = BIT(bit_idx);
    return (ia32_caps[tbl_idx] & mask) != 0;
}


//-----------------------------------------------------------------------------
// CPU identification

// vendor as determined by DetectVendor; zero-initialized until ia32_Init runs.
static Ia32Vendor vendor;

// @return the CPU vendor detected during initialization.
Ia32Vendor ia32_Vendor()
{
    return vendor;
}

// determine the CPU vendor from the CPUID function 0 register signature
// and store it in the file-scope <vendor> variable.
static void DetectVendor()
{
    u32 regs[4];
    if(!ia32_asm_cpuid(0, regs))
        return;

    // copy regs to string
    // note: 'strange' ebx,edx,ecx reg order is due to ModR/M encoding order.
    // use memcpy instead of writing through a u32* - the previous cast
    // violated strict-aliasing rules and assumed 4-byte alignment.
    char vendor_str[13];
    memcpy(vendor_str+0, &regs[EBX], 4);
    memcpy(vendor_str+4, &regs[EDX], 4);
    memcpy(vendor_str+8, &regs[ECX], 4);
    vendor_str[12] = '\0';  // 0-terminate

    if(!strcmp(vendor_str, "AuthenticAMD"))
        vendor = IA32_VENDOR_AMD;
    else if(!strcmp(vendor_str, "GenuineIntel"))
        vendor = IA32_VENDOR_INTEL;
    else
        DEBUG_WARN_ERR(ERR::CPU_UNKNOWN_VENDOR);
}


// signature fields extracted by DetectSignature (CPUID function 1).
static uint model, family;
static uint generation; // == family, except family 0xF maps to 8

// @return the CPU generation derived from the family field.
uint ia32_Generation()
{
    return generation;
}

// extract model/family from the CPU signature (CPUID function 1) and
// derive the <generation> value from the family.
static void DetectSignature()
{
    // zero-initialize so a failed CPUID (warned about below) doesn't
    // leave us reading uninitialized memory.
    u32 regs[4] = {0};
    if(!ia32_asm_cpuid(1, regs))
        DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
    model  = bits(regs[EAX], 4, 7);
    family = bits(regs[EAX], 8, 11);

    switch(family)
    {
    case 5:
    case 6:
    case 7:
        generation = family;
        break;
    case 0xF:
        // family 0xF indicates the extended-family encoding; we lump
        // all such CPUs into generation 8.
        generation = 8;
        break;
    default:
        debug_assert(0);
    }
}


//-----------------------------------------------------------------------------
// identifier string

// filled by DetectIdentifierString.
// 3 CPUID calls x 4 registers x 4 bytes = 48 characters (+ terminator).
static char identifierString[48+1] = {'\0'};

// @return the pretty-printed CPU identifier (empty until ia32_Init has run).
const char* ia32_IdentifierString()
{
    return identifierString;
}

/// functor to remove substrings from the CPU identifier string
class StringStripper
{
    char* m_string;
    size_t m_max_chars;

public:
    // string: 0-terminated buffer to edit in place.
    // max_chars: total buffer size (retained for interface compatibility).
    StringStripper(char* string, size_t max_chars)
    : m_string(string), m_max_chars(max_chars)
    {
    }

    // remove all instances of substring from m_string
    void operator()(const char* substring)
    {
        const size_t substring_length = strlen(substring);
        for(;;)
        {
            char* substring_pos = strstr(m_string, substring);
            if(!substring_pos)
                break;
            // shift the remainder of the string (including its
            // 0-terminator) down over the removed substring.
            // note: using strlen rather than m_max_chars avoids copying
            // uninitialized bytes between the terminator and buffer end.
            const char* rest = substring_pos + substring_length;
            memmove(substring_pos, rest, strlen(rest)+1);
        }
    }
};

// fill identifierString with the CPU brand string (if supported), or
// else with a name derived from the detected vendor/family/model.
static void DetectIdentifierString()
{
    // get brand string (if available)
    // note: ia32_asm_cpuid writes 4 u32s directly to identifierString -
    // be very careful with pointer arithmetic!
    // (functions 0x80000002..0x80000004 each yield 16 characters,
    // i.e. 48 total - identifierString is sized accordingly.)
    u32* u32_string = (u32*)identifierString;
    bool have_brand_string = false;
    if(ia32_asm_cpuid(0x80000002, u32_string+0 ) &&
       ia32_asm_cpuid(0x80000003, u32_string+4) &&
       ia32_asm_cpuid(0x80000004, u32_string+8))
        have_brand_string = true;

    // note: we previously verified max_chars is long enough, so copying
    // short literals into it is safe.

    // fall back to manual detect of CPU type because either:
    // - CPU doesn't support brand string (we use a flag to indicate this
    //   rather than comparing against a default value because it is safer);
    // - the brand string is useless, e.g. "Unknown". this happens on
    //   some older boards whose BIOS reprograms the string for CPUs it
    //   doesn't recognize.
    if(!have_brand_string || strncmp(identifierString, "Unknow", 6) == 0)
    {
        if(vendor == IA32_VENDOR_AMD)
        {
            // everything else is either too old, or should have a brand string.
            if(family == 6)
            {
                // note: these literals previously read "IA32_VENDOR_AMD Duron"
                // etc. - evidently a bad search+replace of the old vendor
                // constants; fixed to match the Intel branch's style.
                if(model == 3 || model == 7)
                    SAFE_STRCPY(identifierString, "AMD Duron");
                else if(model <= 5)
                    SAFE_STRCPY(identifierString, "AMD Athlon");
                else
                {
                    if(ia32_cap(IA32_CAP_AMD_MP))
                        SAFE_STRCPY(identifierString, "AMD Athlon MP");
                    else
                        SAFE_STRCPY(identifierString, "AMD Athlon XP");
                }
            }
        }
        else if(vendor == IA32_VENDOR_INTEL)
        {
            // everything else is either too old, or should have a brand string.
            if(family == 6)
            {
                if(model == 1)
                    SAFE_STRCPY(identifierString, "Intel Pentium Pro");
                else if(model == 3 || model == 5)
                    SAFE_STRCPY(identifierString, "Intel Pentium II");
                else if(model == 6)
                    SAFE_STRCPY(identifierString, "Intel Celeron");
                else
                    SAFE_STRCPY(identifierString, "Intel Pentium III");
            }
        }
    }
    // identifierString already holds a valid brand string; pretty it up.
    else
    {
        const char* const undesired_strings[] = { "(tm)", "(TM)", "(R)", "CPU " };
        std::for_each(undesired_strings, undesired_strings+ARRAY_SIZE(undesired_strings),
            StringStripper(identifierString, ARRAY_SIZE(identifierString)));

        // note: Intel brand strings include a frequency, but we can't rely
        // on it because the CPU may be overclocked. we'll leave it in the
        // string to show measurement accuracy and if SpeedStep is active.
    }
}


//-----------------------------------------------------------------------------
// CPU frequency

// set scheduling priority and restore when going out of scope.
// note: failures are deliberately ignored - raising priority is a
// best-effort measure (the process may lack the required permission).
class ScopedSetPriority
{
    int m_old_policy;
    sched_param m_old_param;

public:
    explicit ScopedSetPriority(int new_priority)
    {
        // get current scheduling policy and priority.
        // if this fails, fall back to sane defaults so the destructor
        // doesn't restore uninitialized garbage.
        if(pthread_getschedparam(pthread_self(), &m_old_policy, &m_old_param) != 0)
        {
            m_old_policy = SCHED_OTHER;
            memset(&m_old_param, 0, sizeof(m_old_param));
        }

        // set new priority
        sched_param new_param = {0};
        new_param.sched_priority = new_priority;
        pthread_setschedparam(pthread_self(), SCHED_FIFO, &new_param);
    }

    ~ScopedSetPriority()
    {
        // restore previous policy and priority.
        pthread_setschedparam(pthread_self(), m_old_policy, &m_old_param);
    }
};

// note: this function uses timer.cpp!get_time, which is implemented via
// whrt.cpp on Windows, which again calls ia32_Init. be careful that
// this function isn't called from there as well, else WHRT will be used
// before its init completes.
// measures the CPU clock frequency by counting TSC deltas over known
// wall-clock intervals. @return frequency [Hz], or -1.0 if unavailable.
double ia32_ClockFrequency()
{
    // if the TSC isn't available, there's really no good way to count the
    // actual CPU clocks per known time interval, so bail.
    // note: loop iterations ("bogomips") are not a reliable measure due
    // to differing IPC and compiler optimizations.
    if(!ia32_cap(IA32_CAP_TSC))
        return -1.0;    // impossible value

    // increase priority to reduce interference while measuring.
    // (restored automatically when ssp goes out of scope)
    const int priority = sched_get_priority_max(SCHED_FIFO)-1;
    ScopedSetPriority ssp(priority);

    // note: no need to "warm up" cpuid - it will already have been
    // called several times by the time this code is reached.
    // (background: it's used in ia32_rdtsc() to serialize instruction flow;
    // the first call is documented to be slower on Intel CPUs)

    int num_samples = 16;
    // if clock is low-res, do less samples so it doesn't take too long.
    // balance measuring time (~ 10 ms) and accuracy (< 1 0/00 error -
    // ok for using the TSC as a time reference)
    if(timer_res() >= 1e-3)
        num_samples = 8;
    std::vector<double> samples(num_samples);

    // each iteration produces one frequency estimate.
    for(int i = 0; i < num_samples; i++)
    {
        double dt;
        i64 dc; // i64 because VC6 can't convert u64 -> double,
                // and we don't need all 64 bits.

        // count # of clocks in max{1 tick, 1 ms}:
        // .. wait for start of tick.
        const double t0 = get_time();
        u64 c1; double t1;
        do
        {
            // note: get_time effectively has a long delay (up to 5 us)
            // before returning the time. we call it before ia32_rdtsc to
            // minimize the delay between actually sampling time / TSC,
            // thus decreasing the chance for interference.
            // (if unavoidable background activity, e.g. interrupts,
            // delays the second reading, inaccuracy is introduced).
            t1 = get_time();
            c1 = ia32_rdtsc();
        }
        while(t1 == t0);
        // .. wait until start of next tick and at least 1 ms elapsed.
        do
        {
            const double t2 = get_time();
            const u64 c2 = ia32_rdtsc();
            dc = (i64)(c2 -c1);
            dt = t2 -t1;
        }
        while(dt < 1e-3);

        // .. freq = (delta_clocks) / (delta_seconds);
        //    ia32_rdtsc/timer overhead is negligible.
        const double freq = dc / dt;
        samples[i] = freq;
    }

    std::sort(samples.begin(), samples.end());

    // median filter (remove upper and lower 25% and average the rest).
    // note: don't just take the lowest value! it could conceivably be
    // too low, if background processing delays reading c1 (see above).
    double sum = 0.0;
    const int lo = num_samples/4, hi = 3*num_samples/4;
    for(int i = lo; i < hi; i++)
        sum += samples[i];

    const double clock_frequency = sum / (hi-lo);
    return clock_frequency;
}


//-----------------------------------------------------------------------------
// processor topology
//-----------------------------------------------------------------------------

// @return the currently running processor's APIC ID
// (reported in bits 24..31 of EBX from CPUID function 1).
uint ia32_ApicId()
{
    // zero-initialize so a failed CPUID (warned about below) yields a
    // deterministic 0 rather than reading uninitialized memory.
    u32 regs[4] = {0};
    if(!ia32_asm_cpuid(1, regs))
        DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
    const uint apicId = bits(regs[EBX], 24, 31);
    return apicId;
}


// OSes report hyperthreading units and cores as "processors". we need to
// drill down and find out the exact counts (for thread pool dimensioning
// and cache sharing considerations).
// note: Intel Appnote 485 (CPUID) assures uniformity of coresPerPackage and
// logicalPerCore.

static uint coresPerPackage = 0;
static uint logicalPerCore = 0;

// determine the maximum number of cores per physical package via the
// vendor-specific CPUID functions; defaults to 1 (single-core).
static void DetectCoresPerPackage()
{
    u32 regs[4];

    coresPerPackage = 1;    // single-core unless..

    switch(vendor)
    {
    case IA32_VENDOR_INTEL:
        // CPUID.4 EAX bits 26..31 = (max cores per package) - 1
        if(ia32_asm_cpuid(4, regs))
            coresPerPackage = bits(regs[EAX], 26, 31)+1;
        break;

    case IA32_VENDOR_AMD:
        // CPUID.0x80000008 ECX bits 0..7 = (core count) - 1
        if(ia32_asm_cpuid(0x80000008, regs))
            coresPerPackage = bits(regs[ECX], 0, 7)+1;
        break;

    default:
        // unknown vendor: keep the single-core default.
        // (explicit default case also avoids -Wswitch warnings.)
        break;
    }
}

// @return whether the CPU is genuinely capable of hyperthreading,
// i.e. the HT flag is set and isn't AMD's compatibility fake.
static bool IsHyperthreadingCapable()
{
    if(!ia32_cap(IA32_CAP_HT))
        return false;   // definitely not

    // AMD N-core systems falsely set the HT bit for compatibility reasons
    // (don't bother resetting it, might confuse callers)
    const bool amdFakeHT = (vendor == IA32_VENDOR_AMD) && ia32_cap(IA32_CAP_AMD_CMP_LEGACY);
    return !amdFakeHT;
}

// determine the number of logical (hyperthreading) units per core.
// note: relies on coresPerPackage having been set by DetectCoresPerPackage.
static void DetectLogicalPerCore()
{
    // zero-initialize so a failed CPUID doesn't leave garbage in regs.
    u32 regs[4] = {0};

    if(!IsHyperthreadingCapable())
    {
        logicalPerCore = 1;
        return;
    }

    if(!ia32_asm_cpuid(1, regs))
        DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
    const uint logicalPerPackage = bits(regs[EBX], 16, 23);
    // cores ought to be uniform WRT # logical processors
    debug_assert(logicalPerPackage % coresPerPackage == 0);
    // clamp to >= 1 so later divisions by logicalPerCore are safe even
    // if CPUID failed and reported 0 logical processors.
    logicalPerCore = std::max(logicalPerPackage / coresPerPackage, 1u);
}

// the above two functions give the maximum number of cores/logical units.
// however, some of them may actually be disabled by the BIOS!
// what we can do is to analyze the APIC IDs. they are allocated sequentially
// for all "processors". treating the IDs as variable-width bitfields
// (according to the number of cores/logical units present) allows
// determining the exact topology as well as number of packages.

// these are set by DetectProcessorTopology, called from ia32_Init.
static uint numPackages = 0;    // i.e. sockets; > 1 => true SMP system
static uint enabledCoresPerPackage = 0;
static uint enabledLogicalPerCore = 0;  // hyperthreading units

typedef std::vector<u8> Ids;
typedef std::set<u8> IdSet;

// add the currently running processor's APIC ID to a list of IDs.
static void StoreApicId(void* param)
{
    Ids* apicIds = (Ids*)param;
    apicIds->push_back(ia32_ApicId());
}


// field := a range of bits sufficient to represent <num_values> integers.
// for each id in apicIds: extract the value of the field at offset bit_pos
// and insert it into ids. afterwards, adjust bit_pos to the next field.
// used to gather e.g. all core IDs from all APIC IDs.
static void ExtractFieldsIntoSet(const Ids& apicIds, uint& bit_pos, uint num_values, IdSet& ids)
{
    const uint id_bits = ceil_log2(num_values);
    if(id_bits == 0)
        return;

    const uint mask = bit_mask(id_bits);

    for(size_t i = 0; i < apicIds.size(); i++)
    {
        const u8 apic_id = apicIds[i];
        const u8 field = (apic_id >> bit_pos) & mask;
        ids.insert(field);
    }

    bit_pos += id_bits;
}


// @return false if unavailable / no information can be returned.
// decodes the per-processor APIC IDs as bitfields (logical | core | package)
// to determine numPackages / enabledCoresPerPackage / enabledLogicalPerCore.
static bool DetectProcessorTopologyViaApicIds()
{
    // old APIC (see ia32_ApicId for details)
    if(generation < 8)
        return false;

    // get the set of all APIC IDs
    Ids apicIds;
    // .. OS affinity support is missing or excludes us from some processors
    if(cpu_CallByEachCPU(StoreApicId, &apicIds) != INFO::OK)
        return false;
    // .. if IDs aren't unique, cpu_CallByEachCPU is broken.
    std::sort(apicIds.begin(), apicIds.end());
    debug_assert(std::unique(apicIds.begin(), apicIds.end()) == apicIds.end());

    // extract values from all 3 ID bitfields into separate sets
    // (bit_pos is advanced past each field by ExtractFieldsIntoSet)
    uint bit_pos = 0;
    IdSet logicalIds;
    ExtractFieldsIntoSet(apicIds, bit_pos, logicalPerCore, logicalIds);
    IdSet coreIds;
    ExtractFieldsIntoSet(apicIds, bit_pos, coresPerPackage, coreIds);
    IdSet packageIds;
    ExtractFieldsIntoSet(apicIds, bit_pos, 0xFF, packageIds);

    // (the set cardinality is representative of all packages/cores since
    // their numbers are uniform across the system.)
    numPackages            = std::max((uint)packageIds.size(), 1u);
    enabledCoresPerPackage = std::max((uint)coreIds   .size(), 1u);
    enabledLogicalPerCore  = std::max((uint)logicalIds.size(), 1u);

    // note: even though APIC IDs are assigned sequentially, we can't make any
    // assumptions about the values/ordering because we get them according to
    // the CPU affinity mask, which is unknown.

    return true;
}


// fallback topology detection: derive package count from the OS-reported
// processor count, assuming no cores/logical units are disabled.
// note: relies on coresPerPackage and logicalPerCore being nonzero
// (guaranteed when called after the Detect* functions in ia32_Init).
static void GuessProcessorTopologyViaOsCount()
{
    const int numProcessors = cpu_OsNumProcessors();

    // note: we cannot hope to always return correct results since disabled
    // cores/logical units cannot be distinguished from the situation of the
    // OS simply not reporting them as "processors". unfortunately this
    // function won't always only be called for older (#core = #logical = 1)
    // systems because DetectProcessorTopologyViaApicIds may fail due to
    // lack of OS support. what we'll do is assume nothing is disabled; this
    // is reasonable because we care most about #packages. it's fine to assume
    // more cores (without inflating the total #processors) because that
    // count only indicates memory barriers etc. ought to be used.
    enabledCoresPerPackage = coresPerPackage;
    enabledLogicalPerCore = logicalPerCore;

    const long numPackagesTimesLogical = numProcessors / coresPerPackage;
    debug_assert(numPackagesTimesLogical != 0); // otherwise processors didn't include cores, which would be stupid

    numPackages = numPackagesTimesLogical / logicalPerCore;
    if(!numPackages)    // processors didn't include logical units (reasonable)
        numPackages = numPackagesTimesLogical;
}


// determine how many CoresPerPackage and LogicalPerCore are
// actually enabled and also count numPackages.
static void DetectProcessorTopology()
{
    // prefer the authoritative APIC ID analysis (needs newer CPU and
    // OS support); otherwise fall back to the OS processor count.
    if(!DetectProcessorTopologyViaApicIds())
        GuessProcessorTopologyViaOsCount();
}


// @return the number of physical packages (sockets); > 1 => true SMP.
uint ia32_NumPackages()
{
#ifndef NDEBUG
    // topology detection (via ia32_Init) must have run first.
    debug_assert(numPackages != 0);
#endif
    return numPackages;
}

// @return the number of *enabled* cores per package.
uint ia32_CoresPerPackage()
{
#ifndef NDEBUG
    // topology detection (via ia32_Init) must have run first.
    debug_assert(enabledCoresPerPackage != 0);
#endif
    return enabledCoresPerPackage;
}

// @return the number of *enabled* logical (hyperthreading) units per core.
uint ia32_LogicalPerCore()
{
#ifndef NDEBUG
    // topology detection (via ia32_Init) must have run first.
    debug_assert(enabledLogicalPerCore != 0);
#endif
    return enabledLogicalPerCore;
}


//-----------------------------------------------------------------------------
// misc stateless functions

// this RDTSC implementation writes edx:eax to a temporary and returns that.
// rationale: this insulates against changing compiler calling conventions,
// at the cost of some efficiency.
// use ia32_asm_rdtsc_edx_eax instead if the return convention is known to be
// edx:eax (should be the case on all 32-bit x86).
u64 ia32_rdtsc_safe()
{
    u64 c;
#if HAVE_MS_ASM
    // the leading CPUID serializes the instruction stream so that RDTSC
    // isn't reordered before earlier instructions (see comment in
    // ia32_ClockFrequency).
    __asm
    {
        cpuid
        rdtsc
        mov         dword ptr [c], eax
        mov         dword ptr [c+4], edx
    }
#elif HAVE_GNU_ASM
    // note: we save+restore EBX to avoid xcode complaining about a
    // "PIC register" being clobbered, whatever that means.
    // "=A" receives RDTSC's edx:eax result; of CPUID's outputs, only
    // ECX remains uncovered, hence the single clobber entry.
    __asm__ __volatile__ (
        "pushl %%ebx; cpuid; popl %%ebx; rdtsc"
        : "=A" (c)
        : /* no input */
    : "ecx" /* cpuid clobbers eax..edx, but the rest are covered */);
#endif
    return c;
}


// trigger a breakpoint exception (INT 3) at the call site,
// transferring control to an attached debugger.
void ia32_DebugBreak()
{
#if HAVE_MS_ASM
    __asm int 3
        // note: this probably isn't necessary, since unix_debug_break
        // (SIGTRAP) is most probably available if HAVE_GNU_ASM.
        // we include it for completeness, though.
#elif HAVE_GNU_ASM
    __asm__ __volatile__ ("int $3");
#endif
}


// enforce strong memory ordering.
// MFENCE requires SSE2 (Pentium IV and later); on older CPUs
// this function is a no-op.
void ia32_MemoryFence()
{
    // Pentium IV
    // (note: the #if'd asm statement below is the body of this 'if')
    if(ia32_cap(IA32_CAP_SSE2))
#if HAVE_MS_ASM
        __asm mfence
#elif HAVE_GNU_ASM
        __asm__ __volatile__ ("mfence");
#endif
}

// serialize the instruction stream via CPUID
// (see the notes in ia32_rdtsc_safe / ia32_ClockFrequency).
void ia32_Serialize()
{
#if HAVE_MS_ASM
    __asm cpuid
#elif HAVE_GNU_ASM
    // fixed: the previous "cpuid" asm had an empty clobber list, so the
    // compiler was free to assume eax..edx survive - but CPUID overwrites
    // all of them. EBX is saved/restored (it may be reserved as the PIC
    // register - see ia32_rdtsc_safe); the rest are declared clobbered.
    // "memory" prevents memory accesses from being moved across the fence.
    __asm__ __volatile__ (
        "pushl %%ebx; cpuid; popl %%ebx"
        : /* no outputs */
        : /* no inputs */
        : "eax", "ecx", "edx", "memory");
#endif
}


// checks if there is an IA-32 CALL instruction right before ret_addr.
// returns INFO::OK if so and ERR::FAIL if not.
// also attempts to determine the call target. if that is possible
// (directly addressed relative or indirect jumps), it is stored in
// target, which is otherwise 0.
//
// this is useful for walking the stack manually.
LibError ia32_GetCallTarget(void* ret_addr, void** target)
{
    *target = 0;

    // points to end of the CALL instruction (which is of unknown length)
    const u8* c = (const u8*)ret_addr;
    // this would allow for avoiding exceptions when accessing ret_addr
    // close to the beginning of the code segment. it's not currently set
    // because this is really unlikely and not worth the trouble.
    // (with len = ~0u, every "len >=" check below always passes.)
    const size_t len = ~0u;

    // CALL rel32 (E8 cd)
    // target = ret_addr + signed 32-bit displacement
    if(len >= 5 && c[-5] == 0xE8)
    {
        *target = (u8*)ret_addr + *(i32*)(c-4);
        return INFO::OK;
    }

    // CALL r/m32 (FF /2)
    // .. CALL [r32 + r32*s]          => FF 14 SIB
    if(len >= 3 && c[-3] == 0xFF && c[-2] == 0x14)
        return INFO::OK;
    // .. CALL [disp32]               => FF 15 disp32
    //    (absolute indirect: the target pointer is read from memory)
    if(len >= 6 && c[-6] == 0xFF && c[-5] == 0x15)
    {
        void* addr_of_target = *(void**)(c-4);
        // there are no meaningful checks we can perform: we're called from
        // the stack trace code, so ring0 addresses may be legit.
        // even if the pointer is 0, it's better to pass its value on
        // (may help in tracking down memory corruption)
        *target = *(void**)addr_of_target;
        return INFO::OK;
    }
    // .. CALL [r32]                  => FF 00-3F(!14/15)
    if(len >= 2 && c[-2] == 0xFF && c[-1] < 0x40 && c[-1] != 0x14 && c[-1] != 0x15)
        return INFO::OK;
    // .. CALL [r32 + r32*s + disp8]  => FF 54 SIB disp8
    if(len >= 4 && c[-4] == 0xFF && c[-3] == 0x54)
        return INFO::OK;
    // .. CALL [r32 + disp8]          => FF 50-57(!54) disp8
    if(len >= 3 && c[-3] == 0xFF && (c[-2] & 0xF8) == 0x50 && c[-2] != 0x54)
        return INFO::OK;
    // .. CALL [r32 + r32*s + disp32] => FF 94 SIB disp32
    if(len >= 7 && c[-7] == 0xFF && c[-6] == 0x94)
        return INFO::OK;
    // .. CALL [r32 + disp32]         => FF 90-97(!94) disp32
    if(len >= 6 && c[-6] == 0xFF && (c[-5] & 0xF8) == 0x90 && c[-5] != 0x94)
        return INFO::OK;
    // .. CALL r32                    => FF D0-D7
    if(len >= 2 && c[-2] == 0xFF && (c[-1] & 0xF8) == 0xD0)
        return INFO::OK;

    // no recognized CALL encoding precedes ret_addr.
    WARN_RETURN(ERR::CPU_UNKNOWN_OPCODE);
}


//-----------------------------------------------------------------------------

static ModuleInitState initState;

// one-time initialization: runs all CPU detection steps.
// repeated calls are no-ops (guarded by ModuleShouldInitialize).
void ia32_Init()
{
    if(!ModuleShouldInitialize(&initState))
        return;

    ia32_asm_cpuid_init();
    ia32_cap_init();

    // note: order matters - the later steps rely on vendor, signature
    // and capability bits having been established above.
    DetectVendor();
    DetectSignature();
    DetectIdentifierString();
    DetectCoresPerPackage();
    DetectLogicalPerCore();
    DetectProcessorTopology();
}


// counterpart to ia32_Init; currently has no resources to release.
void ia32_Shutdown()
{
    if(!ModuleShouldShutdown(&initState))
        return;
    // no cleanup required - all state is static.
}