// Copyright 2011 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package snappy

import (
	"encoding/binary"
	"errors"
	"io"
)

// Encode returns the encoded form of src. The returned slice may be a sub-
// slice of dst if dst was large enough to hold the entire encoded block.
// Otherwise, a newly allocated slice will be returned.
//
// The dst and src must not overlap. It is valid to pass a nil dst.
//
// Encode handles the Snappy block format, not the Snappy stream format.
func Encode(dst, src []byte) []byte {
	if n := MaxEncodedLen(len(src)); n < 0 {
		panic(ErrTooLarge)
	} else if len(dst) < n {
		dst = make([]byte, n)
	}

	// The block starts with the varint-encoded length of the decompressed bytes.
	d := binary.PutUvarint(dst, uint64(len(src)))

	for len(src) > 0 {
		p := src
		src = nil
		if len(p) > maxBlockSize {
			p, src = p[:maxBlockSize], p[maxBlockSize:]
		}
		if len(p) < minNonLiteralBlockSize {
			d += emitLiteral(dst[d:], p)
		} else {
			d += encodeBlock(dst[d:], p)
		}
	}
	return dst[:d]
}

// inputMargin is the minimum number of extra input bytes to keep, inside
// encodeBlock's inner loop. On some architectures, this margin lets us
// implement a fast path for emitLiteral, where the copy of short (<= 16 byte)
// literals can be implemented as a single load to and store from a 16-byte
// register. That literal's actual length can be as short as 1 byte, so this
// can copy up to 15 bytes too much, but that's OK as subsequent iterations of
// the encoding loop will fix up the copy overrun, and this inputMargin ensures
// that we don't overrun the dst and src buffers.
const inputMargin = 16 - 1

// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
// could be encoded with a copy tag. This is the minimum with respect to the
// algorithm used by encodeBlock, not a minimum enforced by the file format.
//
// The encoded output must start with at least a 1 byte literal, as there are
// no previous bytes to copy. A minimal (1 byte) copy after that, generated
// from an emitCopy call in encodeBlock's main loop, would require at least
// another inputMargin bytes, for the reason above: we want any emitLiteral
// calls inside encodeBlock's main loop to use the fast path if possible, which
// requires being able to overrun by inputMargin bytes. Thus,
// minNonLiteralBlockSize equals 1 + 1 + inputMargin.
//
// The C++ code doesn't use this exact threshold, but it could, as discussed at
// https://groups.google.com/d/topic/snappy-compression/oGbhsdIJSJ8/discussion
// The difference between Go (2+inputMargin) and C++ (inputMargin) is purely an
// optimization. It should not affect the encoded form. This is tested by
// TestSameEncodingAsCppShortCopies.
const minNonLiteralBlockSize = 1 + 1 + inputMargin

// MaxEncodedLen returns the maximum length of a snappy block, given its
// uncompressed length.
//
// It will return a negative value if srcLen is too large to encode.
func MaxEncodedLen(srcLen int) int {
	n := uint64(srcLen)
	if n > 0xffffffff {
		return -1
	}
	// Compressed data can be defined as:
	//   compressed := item* literal*
	//   item       := literal* copy
	//
	// The trailing literal sequence has a space blowup of at most 62/60
	// since a literal of length 60 needs one tag byte + one extra byte
	// for length information.
	//
	// Item blowup is trickier to measure. Suppose the "copy" op copies
	// 4 bytes of data. Because of a special check in the encoding code,
	// we produce a 4-byte copy only if the offset is < 65536. Therefore
	// the copy op takes 3 bytes to encode, and this type of item leads
	// to at most the 62/60 blowup for representing literals.
	//
	// Suppose the "copy" op copies 5 bytes of data. If the offset is big
	// enough, it will take 5 bytes to encode the copy op. Therefore the
	// worst case here is a one-byte literal followed by a five-byte copy.
	// That is, 6 bytes of input turn into 7 bytes of "compressed" data.
	//
	// This last factor dominates the blowup, so the final estimate is:
	n = 32 + n + n/6
	if n > 0xffffffff {
		return -1
	}
	return int(n)
}
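
// The helper below is an illustrative sketch, not part of the original
// package API: it shows how a caller can size dst with MaxEncodedLen so that
// Encode compresses into the caller's buffer instead of allocating a fresh
// one. The function and variable names are hypothetical.
func exampleEncodeWithPresizedDst(src []byte) []byte {
	n := MaxEncodedLen(len(src))
	if n < 0 {
		// src is too large (> 0xffffffff bytes) to fit in a snappy block.
		return nil
	}
	buf := make([]byte, n)
	// Since len(buf) >= MaxEncodedLen(len(src)), Encode returns a prefix of
	// buf and performs no further allocation.
	return Encode(buf, src)
}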

var errClosed = errors.New("snappy: Writer is closed")

// NewWriter returns a new Writer that compresses to w.
//
// The Writer returned does not buffer writes. There is no need to Flush or
// Close such a Writer.
//
// Deprecated: the Writer returned is not suitable for many small writes, only
// for few large writes. Use NewBufferedWriter instead, which is efficient
// regardless of the frequency and shape of the writes, and remember to Close
// that Writer when done.
func NewWriter(w io.Writer) *Writer {
	return &Writer{
		w:    w,
		obuf: make([]byte, obufLen),
	}
}

// NewBufferedWriter returns a new Writer that compresses to w, using the
// framing format described at
// https://github.com/google/snappy/blob/master/framing_format.txt
//
// The Writer returned buffers writes. Users must call Close to guarantee all
// data has been forwarded to the underlying io.Writer. They may also call
// Flush zero or more times before calling Close.
func NewBufferedWriter(w io.Writer) *Writer {
	return &Writer{
		w:    w,
		ibuf: make([]byte, 0, maxBlockSize),
		obuf: make([]byte, obufLen),
	}
}
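
// A minimal lifecycle sketch (hypothetical, not part of the original file)
// for the buffered Writer above: because incoming bytes are buffered in
// ibuf, the caller must Close (or at least Flush) before the compressed
// stream is guaranteed to reach the underlying io.Writer.
func exampleBufferedWriterLifecycle(dst io.Writer, data []byte) error {
	w := NewBufferedWriter(dst)
	if _, err := w.Write(data); err != nil {
		w.Close()
		return err
	}
	// Close flushes any bytes still buffered in w.ibuf.
	return w.Close()
}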

// Writer is an io.Writer that can write Snappy-compressed bytes.
//
// Writer handles the Snappy stream format, not the Snappy block format.
type Writer struct {
	w   io.Writer
	err error

	// ibuf is a buffer for the incoming (uncompressed) bytes.
	//
	// Its use is optional. For backwards compatibility, Writers created by the
	// NewWriter function have ibuf == nil, do not buffer incoming bytes, and
	// therefore do not need to be Flush'ed or Close'd.
	ibuf []byte

	// obuf is a buffer for the outgoing (compressed) bytes.
	obuf []byte

	// wroteStreamHeader is whether we have written the stream header.
	wroteStreamHeader bool
}

// Reset discards the writer's state and switches the Snappy writer to write to
// w. This permits reusing a Writer rather than allocating a new one.
func (w *Writer) Reset(writer io.Writer) {
	w.w = writer
	w.err = nil
	if w.ibuf != nil {
		w.ibuf = w.ibuf[:0]
	}
	w.wroteStreamHeader = false
}

// Write satisfies the io.Writer interface.
func (w *Writer) Write(p []byte) (nRet int, errRet error) {
	if w.ibuf == nil {
		// Do not buffer incoming bytes. This does not perform or compress well
		// if the caller of Writer.Write writes many small slices. This
		// behavior is therefore deprecated, but still supported for backwards
		// compatibility with code that doesn't explicitly Flush or Close.
		return w.write(p)
	}

	// The remainder of this method is based on bufio.Writer.Write from the
	// standard library.

	for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err == nil {
		var n int
		if len(w.ibuf) == 0 {
			// Large write, empty buffer.
			// Write directly from p to avoid copy.
			n, _ = w.write(p)
		} else {
			n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
			w.ibuf = w.ibuf[:len(w.ibuf)+n]
			w.Flush()
		}
		nRet += n
		p = p[n:]
	}
	if w.err != nil {
		return nRet, w.err
	}
	n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
	w.ibuf = w.ibuf[:len(w.ibuf)+n]
	nRet += n
	return nRet, nil
}

func (w *Writer) write(p []byte) (nRet int, errRet error) {
	if w.err != nil {
		return 0, w.err
	}
	for len(p) > 0 {
		obufStart := len(magicChunk)
		if !w.wroteStreamHeader {
			w.wroteStreamHeader = true
			copy(w.obuf, magicChunk)
			obufStart = 0
		}

		var uncompressed []byte
		if len(p) > maxBlockSize {
			uncompressed, p = p[:maxBlockSize], p[maxBlockSize:]
		} else {
			uncompressed, p = p, nil
		}
		checksum := crc(uncompressed)

		// Compress the buffer, discarding the result if the improvement
		// isn't at least 12.5%.
		compressed := Encode(w.obuf[obufHeaderLen:], uncompressed)
		chunkType := uint8(chunkTypeCompressedData)
		chunkLen := 4 + len(compressed)
		obufEnd := obufHeaderLen + len(compressed)
		if len(compressed) >= len(uncompressed)-len(uncompressed)/8 {
			chunkType = chunkTypeUncompressedData
			chunkLen = 4 + len(uncompressed)
			obufEnd = obufHeaderLen
		}

		// Fill in the per-chunk header that comes before the body.
		w.obuf[len(magicChunk)+0] = chunkType
		w.obuf[len(magicChunk)+1] = uint8(chunkLen >> 0)
		w.obuf[len(magicChunk)+2] = uint8(chunkLen >> 8)
		w.obuf[len(magicChunk)+3] = uint8(chunkLen >> 16)
		w.obuf[len(magicChunk)+4] = uint8(checksum >> 0)
		w.obuf[len(magicChunk)+5] = uint8(checksum >> 8)
		w.obuf[len(magicChunk)+6] = uint8(checksum >> 16)
		w.obuf[len(magicChunk)+7] = uint8(checksum >> 24)

		if _, err := w.w.Write(w.obuf[obufStart:obufEnd]); err != nil {
			w.err = err
			return nRet, err
		}
		if chunkType == chunkTypeUncompressedData {
			if _, err := w.w.Write(uncompressed); err != nil {
				w.err = err
				return nRet, err
			}
		}
		nRet += len(uncompressed)
	}
	return nRet, nil
}

// Flush flushes the Writer to its underlying io.Writer.
func (w *Writer) Flush() error {
	if w.err != nil {
		return w.err
	}
	if len(w.ibuf) == 0 {
		return nil
	}
	w.write(w.ibuf)
	w.ibuf = w.ibuf[:0]
	return w.err
}

// Close calls Flush and then closes the Writer.
func (w *Writer) Close() error {
	w.Flush()
	ret := w.err
	if w.err == nil {
		w.err = errClosed
	}
	return ret
}
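
// A round-trip sketch (hypothetical helper; it assumes NewReader from this
// package's decode.go and io.ReadAll from Go 1.16+): a stream produced by the
// buffered Writer can be decompressed with a Reader. io.Pipe is used so that
// no imports beyond this file's existing ones are needed.
func exampleRoundTrip(data []byte) ([]byte, error) {
	pr, pw := io.Pipe()
	go func() {
		w := NewBufferedWriter(pw)
		if _, err := w.Write(data); err != nil {
			pw.CloseWithError(err)
			return
		}
		// Close flushes the stream; CloseWithError(nil) then signals EOF to
		// the reading side of the pipe.
		pw.CloseWithError(w.Close())
	}()
	return io.ReadAll(NewReader(pr))
}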