Made the decoder a filter by avoiding one copy pass.

author: jcid <devnull@localhost> 2007-11-27 00:24:52 +0100
committer: jcid <devnull@localhost> 2007-11-27 00:24:52 +0100
commit: b0358c3dea7a0f34dc494d82be08ae814a480a50 (patch)
tree: 264e3369e8a55e7c41131e5519c7eee8cb8656f0 /src
parent: 1f8d6679cf90bd66f514c8a9b1d5536358ca28d3 (diff)
3 files changed, 68 insertions, 46 deletions
diff --git a/src/cache.c b/src/cache.c
index 4f7f074c..448ce3db 100644
--- a/src/cache.c
+++ b/src/cache.c
@@ -366,7 +366,7 @@ int a_Cache_get_buf(const DilloUrl *Url, char **PBuf, int *BufSize)
 
       /* Test for a redirection loop */
       if (entry->Flags & CA_RedirectLoop || i == 3) {
-         MSG_WARN("Redirect loop for URL: >%s<\n", URL_STR_(Url));
+         _MSG_WARN("Redirect loop for URL: >%s<\n", URL_STR_(Url));
          break;
       }
       /* Test for a working redirection */
@@ -461,7 +461,7 @@ static void Cache_parse_header(CacheEntry_t *entry,
 {
    char *header = entry->Header->str;
    char *Length, *Type, *location_str, *encoding;
-   Dstr *decodedBuf;
+   Dstr *dbuf;
 #ifndef DISABLE_COOKIES
    Dlist *Cookies;
    void *data;
@@ -512,8 +512,10 @@ static void Cache_parse_header(CacheEntry_t *entry,
    entry->Decoder = a_Decode_content_init(encoding);
    dFree(encoding);
 
-   decodedBuf = a_Decode_process(entry->Decoder, buf + HdrLen,
-                                 buf_size - HdrLen);
+   dbuf = dStr_sized_new(buf_size - HdrLen);
+   dStr_append_l(dbuf, buf + HdrLen, buf_size - HdrLen);
+
+   dbuf = a_Decode_process(entry->Decoder, dbuf);
 
    if (entry->ExpectedSize > 0) {
       if (entry->ExpectedSize > HUGE_FILESIZE) {
@@ -525,8 +527,8 @@ static void Cache_parse_header(CacheEntry_t *entry,
       dStr_free(entry->Data, 1);
       entry->Data = dStr_sized_new(MIN(entry->ExpectedSize+1, MAX_INIT_BUF));
    }
-   dStr_append_l(entry->Data, decodedBuf->str, decodedBuf->len);
-   dStr_free(decodedBuf, 1);
+   dStr_append_l(entry->Data, dbuf->str, dbuf->len);
+   dStr_free(dbuf, 1);
 
    /* Get Content-Type */
    if ((Type = Cache_parse_field(header, "Content-Type")) == NULL) {
@@ -582,7 +584,7 @@ void a_Cache_process_dbuf(int Op, const char *buf, size_t buf_size,
 {
    int len;
    CacheEntry_t *entry = Cache_entry_search(Url);
-   Dstr *decodedBuf;
+   Dstr *dbuf;
 
    /* Assert a valid entry (not aborted) */
    dReturn_if_fail (entry != NULL);
@@ -625,16 +627,18 @@ void a_Cache_process_dbuf(int Op, const char *buf, size_t buf_size,
 
    entry->TransferSize += buf_size;
 
+   dbuf = dStr_sized_new(buf_size);
+   dStr_append_l(dbuf, buf, buf_size);
+
    /* Assert we have a Decoder.
     * BUG: this is a workaround, more study and a proper design
     * for handling redirects is required */
    if (entry->Decoder != NULL) {
-      decodedBuf = a_Decode_process(entry->Decoder, buf, buf_size);
-      dStr_append_l(entry->Data, decodedBuf->str, decodedBuf->len);
-      dStr_free(decodedBuf, 1);
-   } else {
-      dStr_append_l(entry->Data, buf, buf_size);
+      dbuf = a_Decode_process(entry->Decoder, dbuf);
    }
+   dStr_append_l(entry->Data, dbuf->str, dbuf->len);
+   dStr_free(dbuf, 1);
+
    Cache_process_queue(entry);
 }
 
diff --git a/src/decode.c b/src/decode.c
index 7ae9bfa4..30000e05 100644
--- a/src/decode.c
+++ b/src/decode.c
@@ -7,22 +7,24 @@
 #include "msg.h"
 
 
-const int bufsize = 8*1024;
+static const int bufsize = 8*1024;
 
-
-static Dstr *Decode_null(Decode *dc, const char *inData, int inLen)
+/*
+ * null ("identity") decoding
+ */
+static Dstr *Decode_null(Decode *dc, Dstr *input)
 {
-   Dstr *d = dStr_new("");
-   dStr_append_l(d, inData, inLen);
-   return d;
+   return input;
 }
 
 static void Decode_null_free(Decode *dc)
 {
 }
 
-
-static Dstr *Decode_gzip(Decode *dc, const char *inData, int inLen)
+/*
+ * Decode gzipped data
+ */
+static Dstr *Decode_gzip(Decode *dc, Dstr *input)
 {
    int rc = Z_OK;
 
@@ -31,9 +33,9 @@ static Dstr *Decode_gzip(Decode *dc, const char *inData, int inLen)
    int inputConsumed = 0;
    Dstr *output = dStr_new("");
 
-   while ((rc == Z_OK) && (inputConsumed < inLen)) {
-      zs->next_in = (Bytef *)inData + inputConsumed;
-      zs->avail_in = inLen - inputConsumed;
+   while ((rc == Z_OK) && (inputConsumed < input->len)) {
+      zs->next_in = (Bytef *)input->str + inputConsumed;
+      zs->avail_in = input->len - inputConsumed;
 
       zs->next_out = (Bytef *)dc->buffer;
       zs->avail_out = bufsize;
@@ -52,6 +54,7 @@ static Dstr *Decode_gzip(Decode *dc, const char *inData, int inLen)
       }
    }
 
+   dStr_free(input, 1);
    return output;
 }
 
@@ -62,19 +65,23 @@ static void Decode_gzip_free(Decode *dc)
    dFree(dc->buffer);
 }
 
-
-static Dstr *Decode_charset(Decode *dc, const char *inData, int inLen)
+/*
+ * Translate to desired character set (UTF-8)
+ */
+static Dstr *Decode_charset(Decode *dc, Dstr *input)
 {
    int rc = 0;
 
-   Dstr *input, *output;
+   Dstr *output;
    char *inPtr, *outPtr;
    size_t inLeft, outRoom;
 
    output = dStr_new("");
 
+   dStr_append_l(dc->leftover, input->str, input->len);
+   dStr_free(input, 1);
    input = dc->leftover;
-   dStr_append_l(input, inData, inLen);
+
    inPtr = input->str;
    inLeft = input->len;
 
@@ -103,6 +110,7 @@ static Dstr *Decode_charset(Decode *dc, const char *inData, int inLen)
           *        unknown or unrepresentable in Unicode."
           */
           //dStr_append(output, "\ufffd");
+          // \uxxxx is C99. UTF-8-specific:
           dStr_append_c(output, 0xEF);
           dStr_append_c(output, 0xBF);
           dStr_append_c(output, 0xBD);
@@ -123,7 +131,12 @@ static void Decode_charset_free(Decode *dc)
    dStr_free(dc->leftover, 1);
 }
 
-
+/*
+ * Initialize content decoder. Currently handles gzip.
+ *
+ * zlib is also capable of handling "deflate"/zlib-encoded data, but web
+ * servers have not standardized on whether to send such data with a header.
+ */
 Decode *a_Decode_content_init(const char *format)
 {
    Decode *dc = (Decode *)dMalloc(sizeof(Decode));
@@ -158,19 +171,9 @@ Decode *a_Decode_content_init(const char *format)
    return dc;      
 }
 
-static int Decode_is_latin1(const char *str)
-{
-   return (!(dStrcasecmp(str, "ISO-8859-1") ||
-             dStrcasecmp(str, "latin1") ||
-             dStrcasecmp(str, "ISO_8859-1:1987") ||
-             dStrcasecmp(str, "ISO_8859-1") ||
-             dStrcasecmp(str, "iso-ir-100") ||
-             dStrcasecmp(str, "l1") ||
-             dStrcasecmp(str, "IBM819") ||
-             dStrcasecmp(str, "CP819") ||
-             dStrcasecmp(str, "csISOLatin1")));
-}
-
+/*
+ * Legal names for the ASCII character set
+ */
 static int Decode_is_ascii(const char *str)
 {
    return (!(dStrcasecmp(str, "ASCII") ||
@@ -186,6 +189,13 @@ static int Decode_is_ascii(const char *str)
              dStrcasecmp(str, "ISO646-US")));
 }
 
+/*
+ * Initialize decoder to translate from any character set known to iconv()
+ * to UTF-8.
+ *
+ * GNU iconv(1) will provide a list of known character sets if invoked with
+ * the "--list" flag.
+ */
 Decode *a_Decode_charset_init(const char *format)
 {
    Decode *dc = (Decode *)dMalloc(sizeof(Decode));
@@ -193,7 +203,6 @@ Decode *a_Decode_charset_init(const char *format)
    if (format &&
        strlen(format) &&
        dStrcasecmp(format,"UTF-8") &&
-       !Decode_is_latin1(format) &&
        !Decode_is_ascii(format)) {
 
       iconv_t ic;
@@ -216,11 +225,20 @@ Decode *a_Decode_charset_init(const char *format)
    return dc;      
 }
 
-Dstr *a_Decode_process(Decode *dc, const char *inData, int inLen)
+/*
+ * Filter data using our decoder.
+ *
+ * The input string should not be used after this call. The decoder will
+ * free it if necessary.
+ */
+Dstr *a_Decode_process(Decode *dc, Dstr *input)
 {
-   return dc->decode(dc, inData, inLen);
+   return dc->decode(dc, input);
 }
 
+/*
+ * free our decoder
+ */
 void a_Decode_free(Decode *dc)
 {
    dc->free(dc);
diff --git a/src/decode.h b/src/decode.h
index 2cdc29e7..7f8045cd 100644
--- a/src/decode.h
+++ b/src/decode.h
@@ -13,13 +13,13 @@ struct _Decode {
    char *buffer;
    Dstr *leftover;
    void *state;
-   Dstr *(*decode) (Decode *dc, const char *inData, int inLen);
+   Dstr *(*decode) (Decode *dc, Dstr *input);
    void (*free) (Decode *dc);
 };
 
 Decode *a_Decode_content_init(const char *format);
 Decode *a_Decode_charset_init(const char *format);
-Dstr *a_Decode_process(Decode *dc, const char *inData, int inLen);
+Dstr *a_Decode_process(Decode *dc, Dstr *input);
 void a_Decode_free(Decode *dc);
 
 #ifdef __cplusplus
author	jcid <devnull@localhost>	2007-11-27 00:24:52 +0100
committer	jcid <devnull@localhost>	2007-11-27 00:24:52 +0100
commit	b0358c3dea7a0f34dc494d82be08ae814a480a50 (patch)
tree	264e3369e8a55e7c41131e5519c7eee8cb8656f0 /src
parent	1f8d6679cf90bd66f514c8a9b1d5536358ca28d3 (diff)