diff --git a/CMakeLists.txt b/CMakeLists.txt index 60060e6..3e3dd14 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,7 +48,7 @@ configure_file(Config.h.in config.h) target_compile_definitions(osslsigncode PRIVATE HAVE_CONFIG_H=1) # set sources -target_sources(osslsigncode PRIVATE osslsigncode.c helpers.c msi.c pe.c cab.c cat.c appx.c) +target_sources(osslsigncode PRIVATE osslsigncode.c helpers.c utf.c msi.c pe.c cab.c cat.c appx.c script.c) if(NOT UNIX) target_sources(osslsigncode PRIVATE applink.c) endif(NOT UNIX) diff --git a/appx.c b/appx.c index c15696a..c3b4586 100644 --- a/appx.c +++ b/appx.c @@ -547,6 +547,8 @@ static PKCS7 *appx_pkcs7_extract(FILE_FORMAT_CTX *ctx) */ static int appx_remove_pkcs7(FILE_FORMAT_CTX *ctx, BIO *hash, BIO *outdata) { + uint8_t *data = NULL; + size_t dataSize; uint64_t cdOffset, noEntries = 0; ZIP_FILE *zip = ctx->appx_ctx->zip; ZIP_CENTRAL_DIRECTORY_ENTRY *entry = zipGetCDEntryByName(zip, CONTENT_TYPES_FILENAME); @@ -558,6 +560,12 @@ static int appx_remove_pkcs7(FILE_FORMAT_CTX *ctx, BIO *hash, BIO *outdata) printf("Not a valid .appx file: content types file missing\n"); return 1; /* FAILED */ } + /* read signature data */ + dataSize = zipReadFileDataByName(&data, ctx->appx_ctx->zip, APP_SIGNATURE_FILENAME); + if (dataSize <= 0) { + return 1; /* FAILED, no signature */ + } + OPENSSL_free(data); if (!appx_remove_ct_signature_entry(zip, entry)) { printf("Failed to remove signature entry\n"); return 1; /* FAILED */ diff --git a/cab.c b/cab.c index 5792fc7..6e6d10c 100644 --- a/cab.c +++ b/cab.c @@ -427,11 +427,16 @@ static int cab_remove_pkcs7(FILE_FORMAT_CTX *ctx, BIO *hash, BIO *outdata) size_t i, written, len; uint32_t tmp; uint16_t nfolders, flags; - char *buf = OPENSSL_malloc(SIZE_64K); + char *buf; /* squash the unused parameter warning */ (void)hash; + if (ctx->cab_ctx->sigpos == 0 || ctx->cab_ctx->siglen == 0 + || ctx->cab_ctx->sigpos > ctx->cab_ctx->fileend) { + return 1; /* FAILED, no signature */ + } + buf = 
OPENSSL_malloc(SIZE_64K); /* * u1 signature[4] 4643534D MSCF: 0-3 * u4 reserved1 00000000: 4-7 diff --git a/cmake/CMakeTest.cmake b/cmake/CMakeTest.cmake index 6083937..5e32c21 100644 --- a/cmake/CMakeTest.cmake +++ b/cmake/CMakeTest.cmake @@ -129,9 +129,9 @@ string(SUBSTRING ${sha256sum} 0 64 leafhash) enable_testing() -set(extensions_all "exe" "ex_" "msi" "256appx" "512appx" "cat") -set(extensions_nocat "exe" "ex_" "msi" "256appx" "512appx") -set(extensions_nocatappx "exe" "ex_" "msi") +set(extensions_all "exe" "ex_" "msi" "256appx" "512appx" "cat" "ps1" "psc1" "mof") +set(extensions_nocat "exe" "ex_" "msi" "256appx" "512appx" "ps1" "psc1" "mof") +set(extensions_nocatappx "exe" "ex_" "msi" "ps1" "psc1" "mof") set(formats "pem" "der") # Test 1 diff --git a/helpers.c b/helpers.c index d4d3123..0226c18 100644 --- a/helpers.c +++ b/helpers.c @@ -595,6 +595,9 @@ static int spc_indirect_data_content_create(u_char **blob, int *len, FILE_FORMAT void *hash; SpcIndirectDataContent *idc = SpcIndirectDataContent_new(); + if (!ctx->format->data_blob_get || !ctx->format->hash_length_get) { + return 0; /* FAILED */ + } if (ctx->format->md_get) { /* APPX file specific - use a hash algorithm specified in the AppxBlockMap.xml file */ mdtype = EVP_MD_nid(ctx->format->md_get(ctx)); diff --git a/msi.c b/msi.c index 3ee30f3..4a70e75 100644 --- a/msi.c +++ b/msi.c @@ -100,30 +100,6 @@ static const u_char msi_zeroes[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; -typedef struct { - ASN1_INTEGER *a; - ASN1_OCTET_STRING *string; - ASN1_INTEGER *b; - ASN1_INTEGER *c; - ASN1_INTEGER *d; - ASN1_INTEGER *e; - ASN1_INTEGER *f; -} SpcSipInfo; - -DECLARE_ASN1_FUNCTIONS(SpcSipInfo) - -ASN1_SEQUENCE(SpcSipInfo) = { - ASN1_SIMPLE(SpcSipInfo, a, ASN1_INTEGER), - ASN1_SIMPLE(SpcSipInfo, string, ASN1_OCTET_STRING), - ASN1_SIMPLE(SpcSipInfo, b, ASN1_INTEGER), - ASN1_SIMPLE(SpcSipInfo, c, ASN1_INTEGER), - ASN1_SIMPLE(SpcSipInfo, d, ASN1_INTEGER), - ASN1_SIMPLE(SpcSipInfo, e, ASN1_INTEGER), - 
ASN1_SIMPLE(SpcSipInfo, f, ASN1_INTEGER), -} ASN1_SEQUENCE_END(SpcSipInfo) - -IMPLEMENT_ASN1_FUNCTIONS(SpcSipInfo) - typedef struct { u_char signature[8]; /* 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1 */ u_char unused_clsid[16]; /* reserved and unused */ @@ -321,6 +297,10 @@ static FILE_FORMAT_CTX *msi_ctx_new(GLOBAL_OPTIONS *options, BIO *hash, BIO *out /* * Allocate and return SpcSipInfo object. + * Subject Interface Package (SIP) is an internal Microsoft API for + * transforming arbitrary files into a digestible stream. + * These ClassIDs are found in the indirect data section and identify + * the type of processor needed to validate the signature. * [out] p: SpcSipInfo data * [out] plen: SpcSipInfo data length * [in] ctx: structure holds input and output data (unused) @@ -328,7 +308,7 @@ static FILE_FORMAT_CTX *msi_ctx_new(GLOBAL_OPTIONS *options, BIO *hash, BIO *out */ static ASN1_OBJECT *msi_spc_sip_info_get(u_char **p, int *plen, FILE_FORMAT_CTX *ctx) { - const u_char msistr[] = { + const u_char SpcUUIDSipInfoMsi[] = { 0xf1, 0x10, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46 }; @@ -344,7 +324,7 @@ static ASN1_OBJECT *msi_spc_sip_info_get(u_char **p, int *plen, FILE_FORMAT_CTX ASN1_INTEGER_set(si->d, 0); ASN1_INTEGER_set(si->e, 0); ASN1_INTEGER_set(si->f, 0); - ASN1_OCTET_STRING_set(si->string, msistr, sizeof msistr); + ASN1_OCTET_STRING_set(si->string, SpcUUIDSipInfoMsi, sizeof SpcUUIDSipInfoMsi); *plen = i2d_SpcSipInfo(si, NULL); *p = OPENSSL_malloc((size_t)*plen); i2d_SpcSipInfo(si, p); @@ -638,9 +618,15 @@ static PKCS7 *msi_pkcs7_extract_to_nest(FILE_FORMAT_CTX *ctx) */ static int msi_remove_pkcs7(FILE_FORMAT_CTX *ctx, BIO *hash, BIO *outdata) { + MSI_ENTRY *ds; + /* squash the unused parameter warning */ (void)hash; + ds = msi_signatures_get(ctx->msi_ctx->dirent, NULL); + if (!ds) { + return 1; /* FAILED, no signature */ + } if (!msi_dirent_delete(ctx->msi_ctx->dirent, digital_signature_ex, sizeof 
digital_signature_ex)) { return 1; /* FAILED */ diff --git a/osslsigncode.c b/osslsigncode.c index 94310ac..e75b466 100644 --- a/osslsigncode.c +++ b/osslsigncode.c @@ -123,6 +123,18 @@ ASN1_SEQUENCE(SpcSpOpusInfo) = { IMPLEMENT_ASN1_FUNCTIONS(SpcSpOpusInfo) +ASN1_SEQUENCE(SpcSipInfo) = { + ASN1_SIMPLE(SpcSipInfo, a, ASN1_INTEGER), + ASN1_SIMPLE(SpcSipInfo, string, ASN1_OCTET_STRING), + ASN1_SIMPLE(SpcSipInfo, b, ASN1_INTEGER), + ASN1_SIMPLE(SpcSipInfo, c, ASN1_INTEGER), + ASN1_SIMPLE(SpcSipInfo, d, ASN1_INTEGER), + ASN1_SIMPLE(SpcSipInfo, e, ASN1_INTEGER), + ASN1_SIMPLE(SpcSipInfo, f, ASN1_INTEGER), +} ASN1_SEQUENCE_END(SpcSipInfo) + +IMPLEMENT_ASN1_FUNCTIONS(SpcSipInfo) + ASN1_SEQUENCE(SpcAttributeTypeAndOptionalValue) = { ASN1_SIMPLE(SpcAttributeTypeAndOptionalValue, type, ASN1_OBJECT), ASN1_OPT(SpcAttributeTypeAndOptionalValue, value, ASN1_ANY) @@ -2844,7 +2856,9 @@ static int check_attached_data(GLOBAL_OPTIONS *options) tmp_options->infile = options->outfile; tmp_options->cmd = CMD_VERIFY; - ctx = file_format_msi.ctx_new(tmp_options, NULL, NULL); + ctx = file_format_script.ctx_new(tmp_options, NULL, NULL); + if (!ctx) + ctx = file_format_msi.ctx_new(tmp_options, NULL, NULL); if (!ctx) ctx = file_format_pe.ctx_new(tmp_options, NULL, NULL); if (!ctx) @@ -4317,7 +4331,9 @@ int main(int argc, char **argv) DO_EXIT_1("Failed to create file: %s\n", options.outfile); } } - ctx = file_format_msi.ctx_new(&options, hash, outdata); + ctx = file_format_script.ctx_new(&options, hash, outdata); + if (!ctx) + ctx = file_format_msi.ctx_new(&options, hash, outdata); if (!ctx) ctx = file_format_pe.ctx_new(&options, hash, outdata); if (!ctx) @@ -4366,6 +4382,9 @@ int main(int argc, char **argv) DO_EXIT_0("Unsupported command: remove-signature\n"); } ret = ctx->format->remove_pkcs7(ctx, hash, outdata); + if (ret) { + DO_EXIT_0("Unable to remove existing signature\n"); + } if (ctx->format->update_data_size) { ctx->format->update_data_size(ctx, outdata, NULL); } diff --git 
a/osslsigncode.h b/osslsigncode.h index d5ff984..196eac6 100644 --- a/osslsigncode.h +++ b/osslsigncode.h @@ -82,6 +82,10 @@ #define PROVIDE_ASKPASS 1 #endif +#ifdef _MSC_VER +/* not WIN32, because strcasecmp exists in MinGW */ +#define strcasecmp _stricmp +#endif #ifdef WIN32 #define remove_file(filename) _unlink(filename) @@ -332,6 +336,18 @@ typedef struct { DECLARE_ASN1_FUNCTIONS(SpcSpOpusInfo) +typedef struct { + ASN1_INTEGER *a; + ASN1_OCTET_STRING *string; + ASN1_INTEGER *b; + ASN1_INTEGER *c; + ASN1_INTEGER *d; + ASN1_INTEGER *e; + ASN1_INTEGER *f; +} SpcSipInfo; + +DECLARE_ASN1_FUNCTIONS(SpcSipInfo) + typedef struct { ASN1_OBJECT *type; ASN1_TYPE *value; @@ -468,6 +484,8 @@ typedef struct { DECLARE_ASN1_FUNCTIONS(MsCtlContent) typedef struct file_format_st FILE_FORMAT; + +typedef struct script_ctx_st SCRIPT_CTX; typedef struct msi_ctx_st MSI_CTX; typedef struct pe_ctx_st PE_CTX; typedef struct cab_ctx_st CAB_CTX; @@ -478,6 +496,7 @@ typedef struct { FILE_FORMAT *format; GLOBAL_OPTIONS *options; union { + SCRIPT_CTX *script_ctx; MSI_CTX *msi_ctx; PE_CTX *pe_ctx; CAB_CTX *cab_ctx; @@ -486,6 +505,7 @@ typedef struct { }; } FILE_FORMAT_CTX; +extern FILE_FORMAT file_format_script; extern FILE_FORMAT file_format_msi; extern FILE_FORMAT file_format_pe; extern FILE_FORMAT file_format_cab; diff --git a/pe.c b/pe.c index 9fcea54..06d9edf 100644 --- a/pe.c +++ b/pe.c @@ -404,9 +404,9 @@ static PKCS7 *pe_pkcs7_extract_to_nest(FILE_FORMAT_CTX *ctx) */ static int pe_remove_pkcs7(FILE_FORMAT_CTX *ctx, BIO *hash, BIO *outdata) { - if (ctx->pe_ctx->sigpos == 0) { - printf("PE file does not have any signature\n"); - return 1; /* FAILED */ + if (ctx->pe_ctx->sigpos == 0 || ctx->pe_ctx->siglen == 0 + || ctx->pe_ctx->sigpos > ctx->pe_ctx->fileend) { + return 1; /* FAILED, no signature */ } /* Strip current signature */ ctx->pe_ctx->fileend = ctx->pe_ctx->sigpos; diff --git a/script.c b/script.c new file mode 100644 index 0000000..1324095 --- /dev/null +++ b/script.c @@ -0,0 
+1,865 @@
+/*
+ * Script file support library
+ *
+ * Copyright (C) 2021-2024 Michał Trojnara
+ */
+
+#include "osslsigncode.h"
+#include "helpers.h"
+#include "utf.h"
+
+/* comment syntax used to embed the signature block in a script file */
+typedef enum {comment_hash, comment_xml, comment_c, comment_not_found} comment_style;
+
+/* maps a file name extension to the comment style used by that file type */
+typedef struct {
+    const char *extension;
+    comment_style comment;
+} SCRIPT_FORMAT;
+
+/* supported extensions; the table is terminated by a comment_not_found entry */
+const SCRIPT_FORMAT supported_formats[] = {
+    {".ps1", comment_hash},
+    {".ps1xml", comment_xml},
+    {".psc1", comment_xml},
+    {".psd1", comment_hash},
+    {".psm1", comment_hash},
+    {".cdxml", comment_xml},
+    {".mof", comment_c},
+    {NULL, comment_not_found},
+};
+
+/* marker texts of the first and the last line of a signature block */
+const char *signature_begin = "SIG # Begin signature block";
+const char *signature_end = "SIG # End signature block";
+
+/* opening and closing comment tags wrapped around each signature line */
+typedef struct {
+    const char *open;
+    const char *close;
+} SCRIPT_COMMENT;
+
+/*
+ * Indexed by comment_style.
+ * Every entry must set both tags: they are passed to strlen() and scanned
+ * as C strings, so a missing initializer (NULL) would be dereferenced.
+ * An empty string means "no tag".
+ */
+const SCRIPT_COMMENT comment_text[] = {
+    [comment_hash] = {"# ", ""},
+    [comment_xml] = {"<!-- ", " -->"},
+    [comment_c] = {"/* ", " */"}
+};
+
+struct script_ctx_st {
+    const SCRIPT_COMMENT *comment_text; /* comment tags of the input file type */
+    int utf;                            /* 8 or 16: encoding of the input file */
+    uint32_t sigpos;                    /* byte offset of the signature block, 0 if none */
+    uint32_t fileend;                   /* number of input bytes to process */
+};
+
+/* number of runes of a single line buffered while searching for the signature */
+#define LINE_MAX_LEN 100
+
+/* FILE_FORMAT method prototypes */
+static FILE_FORMAT_CTX *script_ctx_new(GLOBAL_OPTIONS *options, BIO *hash, BIO *outdata);
+static ASN1_OBJECT *script_spc_sip_info_get(u_char **p, int *plen, FILE_FORMAT_CTX *ctx);
+static PKCS7 *script_pkcs7_contents_get(FILE_FORMAT_CTX *ctx, BIO *hash, const EVP_MD *md);
+static int script_hash_length_get(FILE_FORMAT_CTX *ctx);
+static int script_check_file(FILE_FORMAT_CTX *ctx, int detached);
+static u_char *script_digest_calc(FILE_FORMAT_CTX *ctx, const EVP_MD *md);
+static int script_verify_digests(FILE_FORMAT_CTX *ctx, PKCS7 *p7);
+static PKCS7 *script_pkcs7_extract(FILE_FORMAT_CTX *ctx);
+static PKCS7 *script_pkcs7_extract_to_nest(FILE_FORMAT_CTX *ctx);
+static int script_remove_pkcs7(FILE_FORMAT_CTX *ctx, BIO *hash, BIO *outdata);
+static int script_process_data(FILE_FORMAT_CTX *ctx, BIO *hash, BIO *outdata);
+static PKCS7 *script_pkcs7_signature_new(FILE_FORMAT_CTX *ctx, BIO *hash);
+static int script_append_pkcs7(FILE_FORMAT_CTX *ctx, BIO *outdata, PKCS7 *p7);
+static BIO *script_bio_free(BIO *hash, BIO *outdata);
+static void script_ctx_cleanup(FILE_FORMAT_CTX *ctx, BIO *hash, BIO *outdata);
+
+/* method table of the script file format */
+FILE_FORMAT file_format_script = {
+    .ctx_new = script_ctx_new,
+    .data_blob_get = script_spc_sip_info_get,
+    .pkcs7_contents_get = script_pkcs7_contents_get,
+    .hash_length_get = script_hash_length_get,
+    .check_file = script_check_file,
+    .digest_calc = script_digest_calc,
+    .verify_digests = script_verify_digests,
+    .pkcs7_extract = script_pkcs7_extract,
+    .pkcs7_extract_to_nest = script_pkcs7_extract_to_nest,
+    .remove_pkcs7 = script_remove_pkcs7,
+    .process_data = script_process_data,
+    .pkcs7_signature_new = script_pkcs7_signature_new,
+    .append_pkcs7 = script_append_pkcs7,
+    .bio_free = script_bio_free,
+    .ctx_cleanup = script_ctx_cleanup,
+};
+
+/* helper functions */
+static SCRIPT_CTX *script_ctx_get(char *indata, uint32_t filesize, const SCRIPT_COMMENT *comment, int utf);
+static void write_commented(FILE_FORMAT_CTX *ctx, BIO *outdata, const char *data, size_t length);
+static void write_in_encoding(FILE_FORMAT_CTX *ctx, BIO *outdata, const char *line, size_t length);
+static size_t utf8_to_utf16(const char *data, size_t len, uint16_t **out_utf16);
+static size_t utf16_to_utf8(const uint16_t *data, size_t len, char **out_utf8);
+static BIO *script_digest_calc_bio(FILE_FORMAT_CTX *ctx, const EVP_MD *md);
+static int script_digest_convert(BIO *hash, FILE_FORMAT_CTX *ctx, size_t len);
+static int script_write_bio(BIO *data, char *indata, size_t len);
+
+/*
+ * Allocate and return a script file format context.
+ * [in, out] options: structure holds the input data
+ * [out] hash: message digest BIO
+ * [in] outdata: outdata file BIO (unused)
+ * [returns] pointer to script file format context
+ *
+ * NULL is returned when the input file name does not end with one of the
+ * supported extensions, when the file is empty or cannot be mapped, or when
+ * a context allocation fails.  On success options->indata holds the mapped
+ * input file and, if hash is not NULL, a null sink BIO is appended to it.
+ */
+static FILE_FORMAT_CTX *script_ctx_new(GLOBAL_OPTIONS *options, BIO *hash, BIO *outdata)
+{
+    FILE_FORMAT_CTX *ctx;
+    SCRIPT_CTX *script_ctx;
+    const SCRIPT_FORMAT *fmt;
+    uint32_t filesize;
+    const uint8_t utf16_bom[] = {0xff, 0xfe};
+    size_t name_len;
+    int utf;
+
+    /* squash the unused parameter warning */
+    (void)outdata;
+
+    /* find out whether our format is supported */
+    name_len = strlen(options->infile);
+    for (fmt = supported_formats; fmt->comment != comment_not_found; fmt++) {
+        size_t ext_len = strlen(fmt->extension);
+        if(name_len > ext_len && !strcasecmp(options->infile + name_len - ext_len, fmt->extension))
+            break;
+    }
+    if (fmt->comment == comment_not_found)
+        return NULL;
+    printf("Script file format: %s\n", fmt->extension);
+
+    filesize = get_file_size(options->infile);
+    if (filesize == 0)
+        return NULL; /* FAILED */
+
+    options->indata = map_file(options->infile, filesize);
+    if (!options->indata) {
+        return NULL; /* FAILED */
+    }
+    /* a file shorter than the BOM cannot carry one: never compare past
+     * the end of the mapping */
+    if (filesize >= sizeof utf16_bom
+            && !memcmp(options->indata, utf16_bom, sizeof utf16_bom))
+        utf = 16;
+    else
+        utf = 8;
+
+    /* initialize script context */
+    script_ctx = script_ctx_get(options->indata, filesize, comment_text + fmt->comment, utf);
+    if (!script_ctx) {
+        unmap_file(options->indata, filesize);
+        return NULL; /* FAILED */
+    }
+
+    /* initialize file format context */
+    ctx = OPENSSL_malloc(sizeof(FILE_FORMAT_CTX));
+    if (!ctx) {
+        /* release everything acquired so far */
+        OPENSSL_free(script_ctx);
+        unmap_file(options->indata, filesize);
+        return NULL; /* FAILED */
+    }
+    memset(ctx, 0, sizeof(FILE_FORMAT_CTX));
+    ctx->format = &file_format_script;
+    ctx->options = options;
+    ctx->script_ctx = script_ctx;
+
+    if (hash)
+        BIO_push(hash, BIO_new(BIO_s_null()));
+
+    /* FIXME: user interface logic belongs to osslsigncode.c */
+    if (options->pagehash == 1)
+        printf("Warning: -ph option is only valid for PE files\n");
+    if (options->jp >= 0)
+        printf("Warning: -jp option is only valid for CAB files\n");
+    return ctx;
+}
+
+/*
+ * Allocate and return SpcSipInfo object.
+ * Subject Interface Package (SIP) is an internal Microsoft API for
+ * transforming arbitrary files into a digestible stream.
+ * These ClassIDs are found in the indirect data section and identify
+ * the type of processor needed to validate the signature.
+ * https://github.com/sassoftware/relic/blob/620d0b75ec67c0158a8a9120950abe04327d922f/lib/authenticode/structs.go#L154
+ * [out] p: SpcSipInfo data
+ * [out] plen: SpcSipInfo data length
+ * [in] ctx: structure holds input and output data
+ * [returns] pointer to ASN1_OBJECT structure corresponding to SPC_SIPINFO_OBJID
+ */
+static ASN1_OBJECT *script_spc_sip_info_get(u_char **p, int *plen, FILE_FORMAT_CTX *ctx)
+{
+    const u_char SpcUUIDSipInfoPs[] = {
+        0x1f, 0xcc, 0x3b, 0x60, 0x59, 0x4b, 0x08, 0x4e,
+        0xb7, 0x24, 0xd2, 0xc6, 0x29, 0x7e, 0xf3, 0x51
+    };
+    SpcSipInfo *sip_info = SpcSipInfo_new();
+    ASN1_OBJECT *sip_oid;
+    int der_len;
+
+    /* squash the unused parameter warning */
+    (void)ctx;
+
+    /* fill in the structure: the class ID plus the fixed integer fields */
+    ASN1_OCTET_STRING_set(sip_info->string, SpcUUIDSipInfoPs, sizeof SpcUUIDSipInfoPs);
+    ASN1_INTEGER_set(sip_info->a, 65536);
+    ASN1_INTEGER_set(sip_info->b, 0);
+    ASN1_INTEGER_set(sip_info->c, 0);
+    ASN1_INTEGER_set(sip_info->d, 0);
+    ASN1_INTEGER_set(sip_info->e, 0);
+    ASN1_INTEGER_set(sip_info->f, 0);
+
+    /* first pass computes the DER length, second pass encodes */
+    der_len = i2d_SpcSipInfo(sip_info, NULL);
+    *plen = der_len;
+    *p = OPENSSL_malloc((size_t)der_len);
+    i2d_SpcSipInfo(sip_info, p);
+    /* i2d advanced *p past the encoding: rewind it to the buffer start */
+    *p -= der_len;
+
+    SpcSipInfo_free(sip_info);
+    sip_oid = OBJ_txt2obj(SPC_SIPINFO_OBJID, 1);
+    return sip_oid; /* OK */
+}
+
+/*
+ * Build the content to be signed from a freshly computed file digest.
+ * [in] ctx: structure holds input and output data
+ * [in] hash: message digest BIO (unused)
+ * [in] md: message digest algorithm
+ * [returns] data content, or NULL when the digest could not be computed
+ */
+static PKCS7 *script_pkcs7_contents_get(FILE_FORMAT_CTX *ctx, BIO *hash, const EVP_MD *md)
+{
+    BIO *digest_bio = script_digest_calc_bio(ctx, md);
+    ASN1_OCTET_STRING *indirect_data;
+
+    /* squash the unused parameter warning */
+    (void)hash;
+
+    if (digest_bio == NULL)
+        return NULL; /* FAILED */
+
+    indirect_data = spc_indirect_data_content_get(digest_bio, ctx);
+    BIO_free_all(digest_bio);
+    return pkcs7_set_content(indirect_data);
+}
+
+/*
+ * Return the size of the message digest selected in the options.
+ * [in] ctx: structure holds input and output data
+ * [returns] value of EVP_MD_size() for ctx->options->md
+ */
+static int script_hash_length_get(FILE_FORMAT_CTX *ctx)
+{
+    const EVP_MD *selected_md = ctx->options->md;
+
+    return EVP_MD_size(selected_md);
+}
+
+/*
+ * Check if the signature exists.
+ * FIXME: check it in pkcs7_extract()
+ * [in, out] ctx: structure holds input and output data
+ * [in] detached: embedded/detached PKCS#7 signature switch
+ * [returns] 0 on error or 1 on success
+ */
+static int script_check_file(FILE_FORMAT_CTX *ctx, int detached)
+{
+    if (!ctx) {
+        printf("Init error\n\n");
+        return 0; /* FAILED */
+    }
+    if (detached) {
+        printf("Checking the specified catalog file\n\n");
+        return 1; /* OK */
+    }
+    /* sigpos == 0 means that no signature block was found */
+    if (ctx->script_ctx->sigpos == 0
+        || ctx->script_ctx->sigpos > ctx->script_ctx->fileend) {
+        printf("No signature found\n\n");
+        return 0; /* FAILED */
+    }
+
+    return 1; /* OK */
+}
+
+/*
+ * Compute a message digest of the mapped script file (its first
+ * ctx->script_ctx->fileend bytes, without any encoding conversion)
+ * for use with a catalog file.
+ * [in] ctx: structure holds input and output data
+ * [in] md: message digest algorithm
+ * [returns] pointer to calculated message digest (EVP_MD_size(md) bytes,
+ *           to be released with OPENSSL_free()), or NULL on error
+ */
+static u_char *script_digest_calc(FILE_FORMAT_CTX *ctx, const EVP_MD *md)
+{
+    u_char *mdbuf;
+    BIO *hash = BIO_new(BIO_f_md());
+
+    if (!hash) {
+        printf("Unable to set the message digest of BIO\n");
+        return NULL; /* FAILED */
+    }
+    if (!BIO_set_md(hash, md)) {
+        printf("Unable to set the message digest of BIO\n");
+        BIO_free_all(hash);
+        return NULL; /* FAILED */
+    }
+    BIO_push(hash, BIO_new(BIO_s_null()));
+    if (!script_write_bio(hash, ctx->options->indata, ctx->script_ctx->fileend)) {
+        BIO_free_all(hash);
+        return NULL; /* FAILED */
+    }
+    mdbuf = OPENSSL_malloc((size_t)EVP_MD_size(md));
+    if (!mdbuf) {
+        BIO_free_all(hash);
+        return NULL; /* FAILED */
+    }
+    /* BIO_gets() on a digest BIO finalizes the digest; never hand an
+     * uninitialized buffer back to the caller when that fails */
+    if (BIO_gets(hash, (char*)mdbuf, EVP_MD_size(md)) <= 0) {
+        OPENSSL_free(mdbuf);
+        BIO_free_all(hash);
+        return NULL; /* FAILED */
+    }
+    BIO_free_all(hash);
+    return mdbuf; /* OK */
+}
+
+/*
+ * Calculate the hash and compare to PKCS#7 signedData.
+ * [in] ctx: structure holds input and output data
+ * [in] p7: PKCS#7 signature
+ * [returns] 0 on error or 1 on success
+ *
+ * The digest value and algorithm are read from the signature itself, so
+ * they are treated as untrusted: the length is validated before it is
+ * copied into the fixed-size buffer and before the digests are compared.
+ */
+static int script_verify_digests(FILE_FORMAT_CTX *ctx, PKCS7 *p7)
+{
+    int mdtype = -1;
+    int mdlen = 0;
+    u_char mdbuf[EVP_MAX_MD_SIZE];
+    u_char cmdbuf[EVP_MAX_MD_SIZE];
+    const EVP_MD *md;
+    BIO *bhash;
+
+    /* FIXME: this shared code most likely belongs in osslsigncode.c */
+    if (is_content_type(p7, SPC_INDIRECT_DATA_OBJID)) {
+        ASN1_STRING *content_val = p7->d.sign->contents->d.other->value.sequence;
+        const u_char *p = content_val->data;
+        SpcIndirectDataContent *idc = d2i_SpcIndirectDataContent(NULL, &p, content_val->length);
+        if (idc) {
+            if (idc->messageDigest && idc->messageDigest->digest && idc->messageDigest->digestAlgorithm) {
+                int len = idc->messageDigest->digest->length;
+
+                /* reject digests that would not fit into mdbuf */
+                if (len > 0 && len <= EVP_MAX_MD_SIZE) {
+                    mdtype = OBJ_obj2nid(idc->messageDigest->digestAlgorithm->algorithm);
+                    mdlen = len;
+                    memcpy(mdbuf, idc->messageDigest->digest->data, (size_t)len);
+                }
+            }
+            SpcIndirectDataContent_free(idc);
+        }
+    }
+    if (mdtype == -1) {
+        printf("Failed to extract current message digest\n\n");
+        return 0; /* FAILED */
+    }
+    md = EVP_get_digestbynid(mdtype);
+    /* unknown algorithm, or a stored digest of the wrong size for it */
+    if (!md || EVP_MD_size(md) != mdlen) {
+        printf("Failed to extract current message digest\n\n");
+        return 0; /* FAILED */
+    }
+    bhash = script_digest_calc_bio(ctx, md);
+    if (!bhash)
+        return 0; /* FAILED */
+
+    /* BIO_gets() on a digest BIO finalizes and returns the digest */
+    if (BIO_gets(bhash, (char*)cmdbuf, EVP_MD_size(md)) <= 0) {
+        printf("Signature verification: failed\n\n");
+        BIO_free_all(bhash);
+        return 0; /* FAILED */
+    }
+    BIO_free_all(bhash);
+
+    if (!compare_digests(mdbuf, cmdbuf, mdtype)) {
+        printf("Signature verification: failed\n\n");
+        return 0; /* FAILED */
+    }
+    return 1; /* OK */
+}
+
+/*
+ * Extract existing signature in DER format.
+ * [in] ctx: structure holds input and output data
+ * [returns] pointer to PKCS#7 structure
+ *
+ * The bytes between sigpos and fileend are converted to UTF-8 if needed,
+ * the comment tags and the begin/end marker lines are stripped, and the
+ * remaining text is decoded from Base64 and parsed as DER.
+ * NULL is returned when any of these steps fails.
+ */
+static PKCS7 *script_pkcs7_extract(FILE_FORMAT_CTX *ctx)
+{
+    const char *signature_data;
+    size_t signature_len;
+    size_t base64_len, der_max_length, der_length = 0;
+    char *ptr;
+    const char *base64_end;
+    BIO *bio_mem, *bio_b64 = NULL;
+    char *base64_data = NULL;
+    char *der_data = NULL;
+    const char *der_tmp;
+    char *clean_base64 = NULL;
+    int clean_base64_len = 0;
+    const char *open_tag = ctx->script_ctx->comment_text->open;
+    const char *close_tag = ctx->script_ctx->comment_text->close;
+    size_t open_tag_len = strlen(open_tag);
+    size_t close_tag_len = strlen(close_tag);
+    size_t signature_begin_len = strlen(signature_begin);
+    size_t signature_end_len = strlen(signature_end);
+    PKCS7 *retval = NULL;
+
+    /* the subtraction below must not wrap around */
+    if (ctx->script_ctx->sigpos > ctx->script_ctx->fileend)
+        return NULL; /* FAILED */
+    signature_data = ctx->options->indata + ctx->script_ctx->sigpos;
+    signature_len = ctx->script_ctx->fileend - ctx->script_ctx->sigpos;
+    if (signature_len == 0)
+        return NULL; /* FAILED, nothing to parse */
+
+    /* extract Base64 signature */
+    if (ctx->script_ctx->utf == 8) {
+        base64_len = signature_len;
+        base64_data = OPENSSL_malloc(base64_len);
+        if (base64_data)
+            memcpy(base64_data, signature_data, base64_len);
+    } else {
+        base64_len = utf16_to_utf8((const void *)signature_data,
+            signature_len, &base64_data);
+    }
+    if (!base64_data || base64_len == 0) {
+        printf("Malloc failed\n");
+        goto cleanup;
+    }
+    base64_end = base64_data + base64_len;
+
+    /* allocate memory for cleaned Base64 */
+    clean_base64 = OPENSSL_malloc(base64_len);
+    if (!clean_base64) {
+        printf("Malloc failed\n");
+        goto cleanup;
+    }
+
+    /* copy clean Base64 data, one commented line per iteration */
+    for (ptr = base64_data;;) {
+        /* find the opening tag */
+        for(;;) {
+            if (ptr + open_tag_len >= base64_end) {
+                /* ran out of data before the end marker was seen */
+                printf("Signature line too long\n");
+                goto cleanup;
+            }
+            if (!memcmp(ptr, open_tag, open_tag_len)) {
+                ptr += open_tag_len;
+                break;
+            }
+            ptr++;
+        }
+        /* process signature_begin and signature_end */
+        if (ptr + signature_begin_len < base64_end &&
+                !memcmp(ptr, signature_begin, signature_begin_len))
+            ptr += signature_begin_len;
+        if (ptr + signature_end_len <= base64_end &&
+                !memcmp(ptr, signature_end, signature_end_len))
+            break; /* success */
+
+        /* copy until the closing tag */
+        for(;;) {
+            if (ptr + close_tag_len >= base64_end) {
+                printf("Signature line too long\n");
+                goto cleanup;
+            }
+            if (close_tag_len) {
+                if (!memcmp(ptr, close_tag, close_tag_len)) {
+                    ptr += close_tag_len;
+                    break;
+                }
+            }
+            if (*ptr == '\r') {
+                ptr++;
+            } else if (*ptr == '\n') {
+                ptr++;
+                break;
+            } else {
+                /* at most one byte is stored per input byte, so the
+                 * base64_len sized buffer cannot overflow */
+                clean_base64[clean_base64_len++] = *ptr++;
+            }
+        }
+    }
+
+    /* prepare for Base64 decoding */
+    bio_mem = BIO_new_mem_buf(clean_base64, clean_base64_len);
+    if (!bio_mem)
+        goto cleanup;
+    bio_b64 = BIO_new(BIO_f_base64());
+    if (!bio_b64) {
+        /* not yet part of a chain: free it here */
+        BIO_free(bio_mem);
+        goto cleanup;
+    }
+    BIO_push(bio_b64, bio_mem);
+    BIO_set_flags(bio_b64, BIO_FLAGS_BASE64_NO_NL);
+
+    /* allocate memory for DER output */
+    der_max_length = BIO_ctrl_pending(bio_b64);
+    der_data = OPENSSL_malloc(der_max_length);
+    if (!der_data)
+        goto cleanup;
+
+    /* decode Base64 to DER */
+    if (!BIO_read_ex(bio_b64, der_data, der_max_length, &der_length))
+        goto cleanup;
+    if (der_length == 0)
+        goto cleanup;
+
+    /* decode DER */
+    der_tmp = der_data;
+    retval = d2i_PKCS7(NULL, (const unsigned char **)&der_tmp, (long)der_length);
+
+cleanup:
+    /* frees the whole chain, including bio_mem once it has been pushed */
+    BIO_free_all(bio_b64);
+    OPENSSL_free(base64_data);
+    OPENSSL_free(clean_base64);
+    OPENSSL_free(der_data);
+    return retval;
+}
+
+/*
+ * Extract existing signature in DER format.
+ * [in] ctx: structure holds input and output data
+ * [returns] pointer to PKCS#7 structure
+ */
+static PKCS7 *script_pkcs7_extract_to_nest(FILE_FORMAT_CTX *ctx)
+{
+    return script_pkcs7_extract(ctx);
+}
+
+/*
+ * Remove existing signature.
+ * [in, out] ctx: structure holds input and output data
+ * [out] hash: message digest BIO
+ * [out] outdata: outdata file BIO
+ * [returns] 1 on error or 0 on success
+ */
+static int script_remove_pkcs7(FILE_FORMAT_CTX *ctx, BIO *hash, BIO *outdata)
+{
+    const SCRIPT_CTX *script = ctx->script_ctx;
+    const int signature_present =
+        script->sigpos != 0 && script->sigpos <= script->fileend;
+
+    /* squash the unused parameter warning */
+    (void)hash;
+
+    if (!signature_present)
+        return 1; /* FAILED, no signature */
+
+    /* keep only the data that precedes the signature block */
+    return script_write_bio(outdata, ctx->options->indata, script->sigpos)
+        ? 0  /* OK */
+        : 1; /* FAILED */
+}
+
+/*
+ * Copy the input file, without its signature block if there is one,
+ * to outdata and feed the same data to the message digest.
+ * [in, out] ctx: structure holds input and output data
+ * [out] hash: message digest BIO
+ * [out] outdata: outdata file BIO
+ * [returns] 1 on error or 0 on success
+ */
+static int script_process_data(FILE_FORMAT_CTX *ctx, BIO *hash, BIO *outdata)
+{
+    SCRIPT_CTX *script = ctx->script_ctx;
+
+    /* Strip current signature */
+    if (script->sigpos > 0)
+        script->fileend = script->sigpos;
+
+    /* write first, then hash; hashing is skipped when the write fails */
+    if (!script_write_bio(outdata, ctx->options->indata, script->fileend)
+            || !script_digest_convert(hash, ctx, script->fileend))
+        return 1; /* FAILED */
+
+    return 0; /* OK */
+}
+
+/*
+ * Create a new PKCS#7 signature.
+ * [in, out] ctx: structure holds input and output data
+ * [out] hash: message digest BIO
+ * [returns] pointer to PKCS#7 structure
+ */
+static PKCS7 *script_pkcs7_signature_new(FILE_FORMAT_CTX *ctx, BIO *hash)
+{
+    ASN1_OCTET_STRING *content;
+    PKCS7 *p7 = pkcs7_create(ctx);
+
+    if (!p7) {
+        printf("Creating a new signature failed\n");
+        return NULL; /* FAILED */
+    }
+    if (!add_indirect_data_object(p7)) {
+        printf("Adding SPC_INDIRECT_DATA_OBJID failed\n");
+        PKCS7_free(p7);
+        return NULL; /* FAILED */
+    }
+    content = spc_indirect_data_content_get(hash, ctx);
+    if (!content) {
+        printf("Failed to get spcIndirectDataContent\n");
+        PKCS7_free(p7); /* do not leak the signature on this path */
+        return NULL; /* FAILED */
+    }
+    if (!sign_spc_indirect_data_content(p7, content)) {
+        printf("Failed to set signed content\n");
+        PKCS7_free(p7);
+        ASN1_OCTET_STRING_free(content);
+        return NULL; /* FAILED */
+    }
+    ASN1_OCTET_STRING_free(content);
+    return p7;
+}
+
+/*
+ * Append signature to the outfile.
+ * The DER encoded signature is Base64 encoded and written as commented
+ * lines of up to 64 characters between the begin and end marker lines.
+ * [in, out] ctx: structure holds input and output data
+ * [out] outdata: outdata file BIO
+ * [in] p7: PKCS#7 signature
+ * [returns] 1 on error or 0 on success
+ */
+static int script_append_pkcs7(FILE_FORMAT_CTX *ctx, BIO *outdata, PKCS7 *p7)
+{
+    BIO *bio, *b64;
+    BUF_MEM *buffer = NULL;
+    size_t i;
+    static const char crlf[] = {0x0d, 0x0a};
+
+    /* convert to BASE64 */
+    b64 = BIO_new(BIO_f_base64()); /* BIO for base64 encoding */
+    if (!b64)
+        return 1; /* FAILED */
+    BIO_set_flags(b64, BIO_FLAGS_BASE64_NO_NL);
+    bio = BIO_new(BIO_s_mem()); /* BIO to hold the base64 data */
+    if (!bio) {
+        BIO_free(b64);
+        return 1; /* FAILED */
+    }
+    bio = BIO_push(b64, bio); /* chain base64 BIO onto memory BIO */
+    if (!i2d_PKCS7_bio(bio, p7)) {
+        BIO_free_all(bio);
+        return 1; /* FAILED */
+    }
+    (void)BIO_flush(bio);
+    BIO_get_mem_ptr(bio, &buffer);
+    if (!buffer) {
+        BIO_free_all(bio);
+        return 1; /* FAILED */
+    }
+    /* the buffer is released explicitly below */
+    (void)BIO_set_close(bio, BIO_NOCLOSE);
+
+    /* split to individual lines and write to outdata */
+    write_commented(ctx, outdata, signature_begin, strlen(signature_begin));
+    for (i = 0; i < buffer->length; i += 64) {
+        write_commented(ctx, outdata, buffer->data + i,
+            buffer->length - i < 64 ? buffer->length - i : 64);
+    }
+    write_commented(ctx, outdata, signature_end, strlen(signature_end));
+
+    /* signtool expects CRLF terminator at the end of the text file */
+    write_in_encoding(ctx, outdata, crlf, sizeof crlf);
+
+    BUF_MEM_free(buffer);
+    BIO_free_all(bio);
+
+    return 0; /* OK */
+}
+
+/*
+ * Free up an entire outdata BIO chain.
+ * [out] hash: message digest BIO
+ * [out] outdata: outdata file BIO
+ * [returns] none
+ */
+static BIO *script_bio_free(BIO *hash, BIO *outdata)
+{
+    BIO_free_all(hash);
+    BIO_free_all(outdata);
+    /* FIXME: why doesn't the function return void instead of BIO *? */
+    return NULL;
+}
+
+/*
+ * Deallocate a FILE_FORMAT_CTX structure and script format specific structures.
+ * [in, out] ctx: structure holds input and output data
+ * [out] hash: message digest BIO
+ * [out] outdata: outdata file BIO
+ * [returns] none
+ */
+static void script_ctx_cleanup(FILE_FORMAT_CTX *ctx, BIO *hash, BIO *outdata)
+{
+    if (outdata) {
+        BIO_free_all(hash);
+        BIO_free_all(outdata);
+    }
+    /* NOTE(review): script_process_data() may have lowered fileend to sigpos,
+     * in which case this is smaller than the size passed to map_file() -
+     * confirm whether unmap_file() needs the original size */
+    unmap_file(ctx->options->indata, ctx->script_ctx->fileend);
+    OPENSSL_free(ctx->script_ctx);
+    OPENSSL_free(ctx);
+}
+
+/*
+ * Script helper functions
+ */
+
+/*
+ * Allocate a script context and locate the signature block.
+ * The input is decoded rune by rune and buffered one line at a time; the
+ * first line that starts with the opening tag, the begin marker and the
+ * closing tag is taken as the start of the signature block.
+ * [in] indata: mapped file
+ * [in] filesize: mapped file length in bytes
+ * [in] comment: comment tags of the file type
+ * [in] utf: 8 or 16, encoding of the mapped file
+ * [returns] pointer to the script context (sigpos is 0 when no signature
+ *           line was found), or NULL when the allocation fails
+ */
+static SCRIPT_CTX *script_ctx_get(char *indata, uint32_t filesize, const SCRIPT_COMMENT *comment, int utf)
+{
+    SCRIPT_CTX *script_ctx;
+    const char *input_pos, *signature_pos, *ptr;
+    const char *input_end = indata + filesize;
+    const size_t unit_size = (size_t)utf / 8; /* bytes per code unit */
+    uint32_t line[LINE_MAX_LEN], sig_start[40], cr, lf;
+    size_t sig_pos = 0, line_pos = 0, sig_start_pos = 0;
+
+    utf8DecodeRune("\r", 1, &cr);
+    utf8DecodeRune("\n", 1, &lf);
+
+    /* compute runes for the beginning of the signature */
+    for (ptr = comment->open; *ptr; sig_start_pos++)
+        ptr = utf8DecodeRune(ptr, 1, sig_start + sig_start_pos);
+    for (ptr = signature_begin; *ptr; sig_start_pos++)
+        ptr = utf8DecodeRune(ptr, 1, sig_start + sig_start_pos);
+    for (ptr = comment->close; *ptr; sig_start_pos++)
+        ptr = utf8DecodeRune(ptr, 1, sig_start + sig_start_pos);
+
+    /* find the beginning of the signature */
+    for (signature_pos = input_pos = indata; input_pos < input_end; ) {
+        const char *input_prev = input_pos;
+        size_t remaining = (size_t)(input_end - input_pos);
+
+        if (utf == 8)
+            input_pos = utf8DecodeRune(input_pos, remaining, line + line_pos);
+        else
+            input_pos = (const char *)utf16DecodeRune((const void *)input_pos,
+                remaining / 2, line + line_pos);
+        if (!input_pos || input_pos <= input_prev)
+            break; /* the decoder made no progress: stop instead of spinning */
+
+        if (line[line_pos] == lf) {
+            if (line_pos >= sig_start_pos &&
+                    !memcmp(line, sig_start, sig_start_pos * sizeof(uint32_t))) {
+                /* signature_pos is where the previous line was terminated */
+                sig_pos = (size_t)(signature_pos - indata);
+                /* CRLF line ending: step back over the CR as well, but
+                 * never below the start of the file */
+                if (line_pos > 0 && line[line_pos - 1] == cr && sig_pos >= unit_size)
+                    sig_pos -= unit_size;
+                break; /* SUCCEEDED */
+            }
+            line_pos = 0;
+            signature_pos = input_prev; /* previous line */
+        } else if (line_pos < LINE_MAX_LEN - 1) {
+            line_pos++; /* we can ignore lines longer than our buffer */
+        }
+    }
+    printf("Signature position: %lu\n", (unsigned long)sig_pos);
+
+    script_ctx = OPENSSL_malloc(sizeof(SCRIPT_CTX));
+    if (!script_ctx)
+        return NULL; /* FAILED */
+    script_ctx->comment_text = comment;
+    script_ctx->utf = utf;
+    script_ctx->fileend = filesize;
+    script_ctx->sigpos = (uint32_t)sig_pos;
+    return script_ctx; /* OK */
+}
+
+/* write a commented line to the bio:
+ * - prepend with CRLF ("\r\n")
+ * - add opening/closing comment tags
+ * - adjust encoding if needed
+ * nothing is written when the line buffer cannot be allocated */
+static void write_commented(FILE_FORMAT_CTX *ctx, BIO *outdata, const char *data, size_t length)
+{
+    const char *open_tag = ctx->script_ctx->comment_text->open;
+    const char *close_tag = ctx->script_ctx->comment_text->close;
+    size_t open_tag_len = strlen(open_tag);
+    size_t close_tag_len = strlen(close_tag);
+    size_t line_len = 2 + open_tag_len + length + close_tag_len;
+    char *line, *pos;
+
+    /* the buffer needs to be long enough for:
+     * - CRLF ("\r\n")
+     * - opening tag
+     * - up to 64 bytes of data
+     * - closing tag
+     * - trailing NUL ("\0") */
+    line = OPENSSL_malloc(line_len + 1);
+    if (!line)
+        return;
+    pos = line;
+    memcpy(pos, "\r\n", 2);
+    pos += 2;
+    memcpy(pos, open_tag, open_tag_len);
+    pos += open_tag_len;
+    memcpy(pos, data, length);
+    pos += length;
+    memcpy(pos, close_tag, close_tag_len);
+    pos += close_tag_len;
+    *pos = '\0';
+
+    /* adjust encoding */
+    write_in_encoding(ctx, outdata, line, line_len);
+    OPENSSL_free(line);
+}
+
+/* adjust encoding if needed
+ * NOTE(review): write errors cannot be reported through the void return
+ * type and are ignored here */
+static void write_in_encoding(FILE_FORMAT_CTX *ctx, BIO *outdata, const char *line, size_t length)
+{
+    size_t written;
+    if (ctx->script_ctx->utf == 8) {
+        BIO_write_ex(outdata, line, length, &written);
+    } else {
+        uint16_t *utf16_data = NULL;
+        size_t utf16_len = utf8_to_utf16(line, length, &utf16_data);
+
+        if (utf16_data)
+            BIO_write_ex(outdata, utf16_data, utf16_len, &written);
+        OPENSSL_free(utf16_data);
+    }
+}
+
+/* convert len bytes of UTF-8 to UTF-16
+ * return the number of output bytes
+ * *out_utf16 is allocated here and must be released with OPENSSL_free();
+ * it is NULL and 0 is returned when the allocation fails
+ */
+static size_t utf8_to_utf16(const char *data, size_t len, uint16_t **out_utf16)
+{
+    size_t utf16_len = utf8UTF16Count(data, len);
+    *out_utf16 = OPENSSL_malloc(utf16_len * sizeof(uint16_t));
+    if (!*out_utf16)
+        return 0; /* memory allocation failed */
+
+    const char *s = data;
+    uint16_t *d = *out_utf16;
+    uint32_t rune;
+    size_t remaining_len = len;
+
+    while (remaining_len > 0) {
+        s = utf8DecodeRune(s, remaining_len, &rune);
+        if (!s || s <= data)
+            break; /* invalid UTF-8 sequence, or no progress was made */
+        size_t consumed = (size_t)(s - data);
+
+        if (consumed > remaining_len)
+            break; /* never let the remaining length wrap around */
+        remaining_len -= consumed;
+        data = s;
+        d += utf16EncodeRune(rune, d);
+    }
+    return (size_t)(2 * (d - *out_utf16));
+}
+
+/* convert len bytes of UTF-16 to UTF-8
+ * return the number of output bytes
+ * *out_utf8 is allocated here and must be released with OPENSSL_free();
+ * it is NULL and 0 is returned when the allocation fails
+ */
+static size_t utf16_to_utf8(const uint16_t *data, size_t len, char **out_utf8)
+{
+    size_t utf8_len = utf16UTF8Count(data, len/2);
+    *out_utf8 = OPENSSL_malloc(utf8_len);
+    if (!*out_utf8)
+        return 0; /* memory allocation failed */
+
+    const uint16_t *s = data;
+    char *d = *out_utf8;
+    uint32_t rune;
+    size_t remaining_len = len/2; /* counted in 16-bit code units */
+
+    while (remaining_len > 0) {
+        s = utf16DecodeRune(s, remaining_len, &rune);
+        if (!s || s <= data)
+            break; /* invalid UTF-16 sequence, or no progress was made */
+        size_t consumed = (size_t)(s - data);
+
+        if (consumed > remaining_len)
+            break; /* never let the remaining length wrap around */
+        remaining_len -= consumed;
+        data = s;
+        d += utf8EncodeRune(rune, d);
+    }
+    return (size_t)(d - *out_utf8);
+}
+
+/*
+ * Compute a message digest value of a signed or unsigned script file.
+ * Only the data that precedes the signature block (if any) is hashed.
+ * [in] ctx: structure holds input and output data
+ * [in] md: message digest algorithm
+ * [returns] calculated message digest BIO
+ */
+static BIO *script_digest_calc_bio(FILE_FORMAT_CTX *ctx, const EVP_MD *md)
+{
+    size_t fileend;
+    BIO *hash = BIO_new(BIO_f_md());
+
+    if (!hash) {
+        printf("Unable to set the message digest of BIO\n");
+        return NULL; /* FAILED */
+    }
+    if (ctx->script_ctx->sigpos)
+        fileend = ctx->script_ctx->sigpos;
+    else
+        fileend = ctx->script_ctx->fileend;
+
+    if (!BIO_set_md(hash, md)) {
+        printf("Unable to set the message digest of BIO\n");
+        BIO_free_all(hash);
+        return NULL; /* FAILED */
+    }
+    BIO_push(hash, BIO_new(BIO_s_null()));
+    if (!script_digest_convert(hash, ctx, fileend)) {
+        printf("Unable calc a message digest value\n");
+        BIO_free_all(hash);
+        return NULL; /* FAILED */
+    }
+    return hash;
+}
+
+/*
+ * Compute a message digest value
+ * The digest is always fed UTF-16 data: UTF-8 input is converted first.
+ * [in, out] hash: message digest BIO
+ * [in] ctx: structure holds input and output data
+ * [in] len: mapped file length
+ * [returns] 0 on error or 1 on success
+ */
+static int script_digest_convert(BIO *hash, FILE_FORMAT_CTX *ctx, size_t len)
+{
+    if (ctx->script_ctx->utf == 8) { /* need to convert to UTF-16 */
+        uint16_t *utf16_data = NULL;
+        size_t utf16_len = utf8_to_utf16(ctx->options->indata,
+            len, &utf16_data);
+
+        if (!utf16_data)
+            return 0; /* FAILED, never hash a missing buffer */
+        if (!script_write_bio(hash, (char *)utf16_data, utf16_len)) {
+            OPENSSL_free(utf16_data);
+            return 0; /* FAILED */
+        }
+        OPENSSL_free(utf16_data);
+    } else { /* already UTF-16 -> no need to convert */
+        if (!script_write_bio(hash, ctx->options->indata, len)) {
+            return 0; /* FAILED */
+        }
+    }
+    return 1; /* OK */
+}
+
+/*
+ * Write len bytes from data to BIO
+ * [in, out] bio: message digest or outdata BIO
+ * [in] indata: mapped file
+ * [in] len: indata length
+ * [returns] 0 on error or 1 on success
+ */
+static int script_write_bio(BIO *bio, char *indata, size_t len)
+{
+    size_t i = 
0, written; + + while (len > 0) { + if (!BIO_write_ex(bio, indata + i, len, &written)) + return 0; /* FAILED */ + len -= written; + i += written; + } + return 1; /* OK */ +} + +/* +Local Variables: + c-basic-offset: 4 + tab-width: 4 + indent-tabs-mode: nil +End: + + vim: set ts=4 expandtab: +*/ diff --git a/tests/files/unsigned.256appx b/tests/files/unsigned.256appx old mode 100755 new mode 100644 diff --git a/tests/files/unsigned.512appx b/tests/files/unsigned.512appx old mode 100755 new mode 100644 diff --git a/tests/files/unsigned.cat b/tests/files/unsigned.cat index 8160bb7..73ad5ae 100644 Binary files a/tests/files/unsigned.cat and b/tests/files/unsigned.cat differ diff --git a/tests/files/unsigned.mof b/tests/files/unsigned.mof new file mode 100644 index 0000000..af36eeb Binary files /dev/null and b/tests/files/unsigned.mof differ diff --git a/tests/files/unsigned.ps1 b/tests/files/unsigned.ps1 new file mode 100644 index 0000000..85586cd --- /dev/null +++ b/tests/files/unsigned.ps1 @@ -0,0 +1,2 @@ +cls +Write-Host "żółć" \ No newline at end of file diff --git a/tests/files/unsigned.psc1 b/tests/files/unsigned.psc1 new file mode 100644 index 0000000..35d40c9 --- /dev/null +++ b/tests/files/unsigned.psc1 @@ -0,0 +1,5 @@ + + + 5.1.19041.3930 + + \ No newline at end of file diff --git a/tests/sources/CatalogDefinitionFileName.cdf b/tests/sources/CatalogDefinitionFileName.cdf index 4310f58..204dba4 100644 --- a/tests/sources/CatalogDefinitionFileName.cdf +++ b/tests/sources/CatalogDefinitionFileName.cdf @@ -46,3 +46,12 @@ CATATTR1=0x11010001:OSAttr:2:6.0 CABfile=..\files\unsigned.ex_ CABfileATTR1=0x11010001:File:unsigned.ex_ + +PS1file=..\files\unsigned.ps1 +PS1fileATTR1=0x11010001:File:unsigned.ps1 + +PSC1file=..\files\unsigned.psc1 +PSC1fileATTR1=0x11010001:File:unsigned.psc1 + +MOFfile=..\files\unsigned.mof +MOFfileATTR1=0x11010001:File:unsigned.mof diff --git a/utf.c b/utf.c new file mode 100644 index 0000000..25fd88b --- /dev/null +++ b/utf.c @@ -0,0 +1,347 
@@ +// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/ +// 10 november 2016 +#include "utf.h" + +// this code imitates Go's unicode/utf8 and unicode/utf16 +// the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not) +// it is also an imitation so we can license it under looser terms than the Go source +#define badrune 0xFFFD + +// encoded must be at most 4 bytes +// TODO clean this code up somehow +size_t utf8EncodeRune(uint32_t rune, char *encoded) +{ + uint8_t b, c, d, e; + size_t n; + + // not in the valid range for Unicode + if (rune > 0x10FFFF) + rune = badrune; + // surrogate runes cannot be encoded + if (rune >= 0xD800 && rune < 0xE000) + rune = badrune; + + if (rune < 0x80) { // ASCII bytes represent themselves + b = (uint8_t) (rune & 0xFF); + n = 1; + goto done; + } + if (rune < 0x800) { // two-byte encoding + c = (uint8_t) (rune & 0x3F); + c |= 0x80; + rune >>= 6; + b = (uint8_t) (rune & 0x1F); + b |= 0xC0; + n = 2; + goto done; + } + if (rune < 0x10000) { // three-byte encoding + d = (uint8_t) (rune & 0x3F); + d |= 0x80; + rune >>= 6; + c = (uint8_t) (rune & 0x3F); + c |= 0x80; + rune >>= 6; + b = (uint8_t) (rune & 0x0F); + b |= 0xE0; + n = 3; + goto done; + } + // otherwise use a four-byte encoding + e = (uint8_t) (rune & 0x3F); + e |= 0x80; + rune >>= 6; + d = (uint8_t) (rune & 0x3F); + d |= 0x80; + rune >>= 6; + c = (uint8_t) (rune & 0x3F); + c |= 0x80; + rune >>= 6; + b = (uint8_t) (rune & 0x07); + b |= 0xF0; + n = 4; + +done: + encoded[0] = (char)b; + if (n > 1) + encoded[1] = (char)c; + if (n > 2) + encoded[2] = (char)d; + if (n > 3) + encoded[3] = (char)e; + return n; +} + +const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune) +{ + uint8_t b, c; + uint8_t lowestAllowed, highestAllowed; + size_t i, expected; + int bad; + + b = (uint8_t) (*s); + if (b < 0x80) { // ASCII bytes represent themselves + *rune = b; + 
s++; + return s; + } + // 0xC0 and 0xC1 cover 2-byte overlong equivalents + // 0xF5 to 0xFD cover values > 0x10FFFF + // 0xFE and 0xFF were never defined (always illegal) + if (b < 0xC2 || b > 0xF4) { // invalid + *rune = badrune; + s++; + return s; + } + + // this determines the range of allowed first continuation bytes + lowestAllowed = 0x80; + highestAllowed = 0xBF; + switch (b) { + case 0xE0: + // disallow 3-byte overlong equivalents + lowestAllowed = 0xA0; + break; + case 0xED: + // disallow surrogate characters + highestAllowed = 0x9F; + break; + case 0xF0: + // disallow 4-byte overlong equivalents + lowestAllowed = 0x90; + break; + case 0xF4: + // disallow values > 0x10FFFF + highestAllowed = 0x8F; + break; + } + + // and this determines how many continuation bytes are expected + expected = 1; + if (b >= 0xE0) + expected++; + if (b >= 0xF0) + expected++; + if (nElem != 0) { // are there enough bytes? + nElem--; + if (nElem < expected) { // nope + *rune = badrune; + s++; + return s; + } + } + + // ensure that everything is correct + // if not, **only** consume the initial byte + bad = 0; + for (i = 0; i < expected; i++) { + c = (uint8_t) (s[1 + i]); + if (c < lowestAllowed || c > highestAllowed) { + bad = 1; + break; + } + // the old lowestAllowed and highestAllowed is only for the first continuation byte + lowestAllowed = 0x80; + highestAllowed = 0xBF; + } + if (bad) { + *rune = badrune; + s++; + return s; + } + + // now do the topmost bits + if (b < 0xE0) + *rune = b & 0x1F; + else if (b < 0xF0) + *rune = b & 0x0F; + else + *rune = b & 0x07; + s++; // we can finally move on + + // now do the continuation bytes + for (; expected; expected--) { + c = (uint8_t) (*s); + s++; + c &= 0x3F; // strip continuation bits + *rune <<= 6; + *rune |= c; + } + + return s; +} + +// encoded must have at most 2 elements +size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded) +{ + uint16_t low, high; + + // not in the valid range for Unicode + if (rune > 0x10FFFF) + rune = 
badrune; + // surrogate runes cannot be encoded + if (rune >= 0xD800 && rune < 0xE000) + rune = badrune; + + if (rune < 0x10000) { + encoded[0] = (uint16_t) rune; + return 1; + } + + rune -= 0x10000; + low = (uint16_t) (rune & 0x3FF); + rune >>= 10; + high = (uint16_t) (rune & 0x3FF); + encoded[0] = high | 0xD800; + encoded[1] = low | 0xDC00; + return 2; +} + +// TODO see if this can be cleaned up somehow +const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune) +{ + uint16_t high, low; + + if (*s < 0xD800 || *s >= 0xE000) { + // self-representing character + *rune = *s; + s++; + return s; + } + if (*s >= 0xDC00) { + // out-of-order surrogates + *rune = badrune; + s++; + return s; + } + if (nElem == 1) { // not enough elements + *rune = badrune; + s++; + return s; + } + high = *s; + high &= 0x3FF; + if (s[1] < 0xDC00 || s[1] >= 0xE000) { + // bad surrogate pair + *rune = badrune; + s++; + return s; + } + s++; + low = *s; + s++; + low &= 0x3FF; + *rune = high; + *rune <<= 10; + *rune |= low; + *rune += 0x10000; + return s; +} + +// TODO find a way to reduce the code in all of these somehow +// TODO find a way to remove u as well +size_t utf8RuneCount(const char *s, size_t nElem) +{ + size_t len; + uint32_t rune; + + if (nElem != 0) { + const char *t, *u; + + len = 0; + t = s; + while (nElem != 0) { + u = utf8DecodeRune(t, nElem, &rune); + len++; + nElem -= (size_t)(u - t); + t = u; + } + return len; + } + len = 0; + while (*s) { + s = utf8DecodeRune(s, nElem, &rune); + len++; + } + return len; +} + +size_t utf8UTF16Count(const char *s, size_t nElem) +{ + size_t len; + uint32_t rune; + uint16_t encoded[2]; + + if (nElem != 0) { + const char *t, *u; + + len = 0; + t = s; + while (nElem != 0) { + u = utf8DecodeRune(t, nElem, &rune); + len += utf16EncodeRune(rune, encoded); + nElem -= (size_t)(u - t); + t = u; + } + return len; + } + len = 0; + while (*s) { + s = utf8DecodeRune(s, nElem, &rune); + len += utf16EncodeRune(rune, encoded); + } + 
return len; +} + +size_t utf16RuneCount(const uint16_t *s, size_t nElem) +{ + size_t len; + uint32_t rune; + + if (nElem != 0) { + const uint16_t *t, *u; + + len = 0; + t = s; + while (nElem != 0) { + u = utf16DecodeRune(t, nElem, &rune); + len++; + nElem -= (size_t)(u - t); + t = u; + } + return len; + } + len = 0; + while (*s) { + s = utf16DecodeRune(s, nElem, &rune); + len++; + } + return len; +} + +size_t utf16UTF8Count(const uint16_t *s, size_t nElem) +{ + size_t len; + uint32_t rune; + char encoded[4]; + + if (nElem != 0) { + const uint16_t *t, *u; + + len = 0; + t = s; + while (nElem != 0) { + u = utf16DecodeRune(t, nElem, &rune); + len += utf8EncodeRune(rune, encoded); + nElem -= (size_t)(u - t); + t = u; + } + return len; + } + len = 0; + while (*s) { + s = utf16DecodeRune(s, nElem, &rune); + len += utf8EncodeRune(rune, encoded); + } + return len; +} diff --git a/utf.h b/utf.h new file mode 100644 index 0000000..b810a49 --- /dev/null +++ b/utf.h @@ -0,0 +1,61 @@ +// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/ +// 10 november 2016 + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +// if nElem == 0, assume the buffer has no upper limit and is '\0' terminated +// otherwise, assume buffer is NOT '\0' terminated but is bounded by nElem *elements* + +extern size_t utf8EncodeRune(uint32_t rune, char *encoded); +extern const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune); +extern size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded); +extern const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune); + +extern size_t utf8RuneCount(const char *s, size_t nElem); +extern size_t utf8UTF16Count(const char *s, size_t nElem); +extern size_t utf16RuneCount(const uint16_t *s, size_t nElem); +extern size_t utf16UTF8Count(const uint16_t *s, size_t nElem); + +#ifdef __cplusplus +} + +// Provide overloads on Windows for using these functions with wchar_t and WCHAR when wchar_t is a keyword in 
C++ mode (the default). +// Otherwise, you'd need to cast to pass a wchar_t pointer, WCHAR pointer, or equivalent to these functions. +// We use __wchar_t to be independent of the setting; see https://blogs.msdn.microsoft.com/oldnewthing/20161201-00/?p=94836 (ironically posted one day after I initially wrote this code!). +// TODO check this on MinGW-w64 +// TODO check this under /Wall +// TODO C-style casts enough? or will that fail in /Wall? +// TODO same for UniChar/unichar on Mac? if both are unsigned then we have nothing to worry about +#if defined(_MSC_VER) + +inline size_t utf16EncodeRune(uint32_t rune, __wchar_t *encoded) +{ + return utf16EncodeRune(rune, reinterpret_cast(encoded)); +} + +inline const __wchar_t *utf16DecodeRune(const __wchar_t *s, size_t nElem, uint32_t *rune) +{ + const uint16_t *ret; + + ret = utf16DecodeRune(reinterpret_cast(s), nElem, rune); + return reinterpret_cast(ret); +} + +inline size_t utf16RuneCount(const __wchar_t *s, size_t nElem) +{ + return utf16RuneCount(reinterpret_cast(s), nElem); +} + +inline size_t utf16UTF8Count(const __wchar_t *s, size_t nElem) +{ + return utf16UTF8Count(reinterpret_cast(s), nElem); +} + +#endif + +#endif