// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Rice-Golomb decoder for blacklist updates.
// Details at: https://en.wikipedia.org/wiki/Golomb_coding

#ifndef COMPONENTS_SAFE_BROWSING_DB_V4_RICE_H_
#define COMPONENTS_SAFE_BROWSING_DB_V4_RICE_H_

#include <ostream>
#include <string>
#include <vector>
#include "base/gtest_prod_util.h"

#if defined(USE_SYSTEM_PROTOBUF)
#include <google/protobuf/repeated_field.h>
#else
#include "third_party/protobuf/src/google/protobuf/repeated_field.h"
#endif

namespace safe_browsing {

// Enumerate different failure events while decoding the Rice-encoded string
// sent by the server for histogramming purposes. DO NOT CHANGE THE ORDERING OF
// THESE VALUES.
enum V4DecodeResult {
  // No error.
  DECODE_SUCCESS = 0,

  // Exceeded the number of entries to expect.
  DECODE_NO_MORE_ENTRIES_FAILURE = 1,

  // Requested to decode >32 bits.
  DECODE_REQUESTED_TOO_MANY_BITS_FAILURE = 2,

  // All bits had already been read and interpreted in the encoded string.
  DECODE_RAN_OUT_OF_BITS_FAILURE = 3,

  // The num_entries argument to DecodePrefixes or DecodeIntegers was negative.
  NUM_ENTRIES_NEGATIVE_FAILURE = 4,

  // Rice-encoding parameter was non-positive when the number of encoded entries
  // was > 0.
  RICE_PARAMETER_NON_POSITIVE_FAILURE = 5,

  // |encoded_data| was empty when the number of encoded entries was > 0.
  ENCODED_DATA_UNEXPECTED_EMPTY_FAILURE = 6,

  // decoded value had an integer overflow, which is unexpected.
  DECODED_INTEGER_OVERFLOW_FAILURE = 7,

  // Memory space for histograms is determined by the max.  ALWAYS
  // ADD NEW VALUES BEFORE THIS ONE.
  DECODE_RESULT_MAX
};

class V4RiceDecoder {
 public:
  // Decodes the Rice-encoded string in |encoded_data| as a list of integers
  // and stores them in |out|. |rice_parameter| is the exponent of 2 for
  // calculating 'M', |first_value| is the first value in the output sequence,
  // |num_entries| is the number of subsequent encoded entries. Each decoded
  // value is a positive offset from the previous value.
  // So, for instance, if the unencoded sequence is: [3, 7, 25], then
  // |first_value| = 3, |num_entries| = 2 and decoding the |encoded_data| will
  // produce the offsets: [4, 18].
  static V4DecodeResult DecodeIntegers(
      const ::google::protobuf::int64 first_value,
      const ::google::protobuf::int32 rice_parameter,
      const ::google::protobuf::int32 num_entries,
      const std::string& encoded_data,
      ::google::protobuf::RepeatedField<::google::protobuf::int32>* out);

  // Decodes the Rice-encoded string in |encoded_data| as a string of 4-byte
  // hash prefixes and stores them in |out|. The rest of the arguments are the
  // same as for |DecodeIntegers|.
  // Important: |out| is only meant to be used as a concatenated list of sorted
  // 4-byte hash prefixes, not as a vector of uint32_t values.
  // This method does the following:
  // 1. Rice-decode the |encoded_data| as a list of uint32_t values.
  // 2. Flip the endianness (on little-endian machines) of each of these
  //    values. This is done because when a hash prefix is represented as a
  //    uint32_t, the bytes get reordered. This generates the hash prefix that
  //    the server would have sent in the absence of Rice-encoding.
  // 3. Sort the resulting list of uint32_t values.
  // 4. Flip the endianness once again since the uint32_t are expected to be
  //    consumed as a concatenated list of 4-byte hash prefixes, when merging
  //    the
  //    update with the existing state.
  static V4DecodeResult DecodePrefixes(
      const ::google::protobuf::int64 first_value,
      const ::google::protobuf::int32 rice_parameter,
      const ::google::protobuf::int32 num_entries,
      const std::string& encoded_data,
      std::vector<uint32_t>* out);

  virtual ~V4RiceDecoder();

  std::string DebugString() const;

 private:
  FRIEND_TEST_ALL_PREFIXES(V4RiceTest, TestDecoderGetNextWordWithNoData);
  FRIEND_TEST_ALL_PREFIXES(V4RiceTest, TestDecoderGetNextBitsWithNoData);
  FRIEND_TEST_ALL_PREFIXES(V4RiceTest, TestDecoderGetNextValueWithNoData);
  FRIEND_TEST_ALL_PREFIXES(V4RiceTest, TestDecoderGetNextValueWithNoEntries);
  friend class V4RiceTest;

  // Validate some of the parameters passed to the decode methods.
  static V4DecodeResult ValidateInput(
      const ::google::protobuf::int32 rice_parameter,
      const ::google::protobuf::int32 num_entries,
      const std::string& encoded_data);

  // The |rice_parameter| is the exponent of 2 for calculating 'M',
  // |num_entries| is the number of encoded entries in the |encoded_data| and
  // |encoded_data| is the Rice-encoded string to decode.
  V4RiceDecoder(const ::google::protobuf::int32 rice_parameter,
                const ::google::protobuf::int32 num_entries,
                const std::string& encoded_data);

  // Returns true until |num_entries| entries have been decoded.
  bool HasAnotherValue() const;

  // Populates |value| with the next 32-bit unsigned integer decoded from
  // |encoded_data|.
  V4DecodeResult GetNextValue(uint32_t* value);

  // Reads in up to 32 bits from |encoded_data| into |word|, from which
  // subsequent GetNextBits() calls read bits.
  V4DecodeResult GetNextWord(uint32_t* word);

  // Reads |num_requested_bits| into |x| from |current_word_| and advances it
  // if needed by calling GetNextWord().
  V4DecodeResult GetNextBits(unsigned int num_requested_bits, uint32_t* x);

  // Reads |num_requested_bits| from |current_word_|.
  uint32_t GetBitsFromCurrentWord(unsigned int num_requested_bits);

  // The Rice parameter, which is the exponent of two for calculating 'M'. 'M'
  // is used as the base to calculate the quotient and remainder in the
  // algorithm.
  const unsigned int rice_parameter_;

  // The number of entries encoded in the data stream.
  ::google::protobuf::int32 num_entries_;

  // The Rice-encoded string.
  const std::string data_;

  // Represents how many total bytes have we read from |data_| into
  // |current_word_|.
  unsigned int data_byte_index_;

  // Represents the number of bits that we have read from |current_word_|. When
  // this becomes 32, which is the size of |current_word_|, a new
  // |current_word_| needs to be read from |data_|.
  unsigned int current_word_bit_index_;

  // The 32-bit value read from |data_|. All bit reading operations operate on
  // |current_word_|.
  uint32_t current_word_;
};

std::ostream& operator<<(std::ostream& os, const V4RiceDecoder& rice_decoder);

}  // namespace safe_browsing

#endif  // COMPONENTS_SAFE_BROWSING_DB_V4_RICE_H_
