@@ -70,8 +70,8 @@ module Codec.CBOR.Decoding
7070 , peekTokenType -- :: Decoder s TokenType
7171 , TokenType (.. )
7272
73- -- ** Special operations
74- , peekAvailable -- :: Decoder s Int
73+ -- ** Byte offsets and byte spans within the input stream
74+ -- $input
7575 , ByteOffset
7676 , peekByteOffset -- :: Decoder s ByteOffset
7777 , decodeWithByteOffsets
@@ -80,6 +80,7 @@ module Codec.CBOR.Decoding
8080 , closeByteSpan -- :: Decoder s ()
8181 , peekByteSpan -- :: Decoder s LazyByteString
8282 , decodeWithByteSpan
83+ , peekAvailable -- :: Decoder s Int
8384
8485 -- ** Canonical CBOR
8586 -- $canonical
@@ -961,11 +962,80 @@ peekTokenType = Decoder (\k -> return (PeekTokenType (\tk -> k tk)))
961962-- | Peek and return the length of the current buffer that we're
962963-- running our decoder on.
963964--
965+ -- This is not typically very useful. One case where it can be useful is to
966+ -- help mitigate resource attacks when decoding untrusted data that uses length
967+ -- prefixed encodings. For example, an array: when decoding an array it is most
968+ -- efficient to allocate a whole array and then fill it in while decoding each
969+ -- element. The serialised representation will provide a length, however this
970+ -- information cannot be trusted and used to pre-allocate the whole array,
971+ -- as an attacker could trivially force memory exhaustion. Knowing a lower bound
972+ -- on the amount of data the user has supplied can be useful to allow allocating
973+ -- in reasonable sized batches, which is much more efficient than having to use
974+ -- a fully dynamicly-sized intermediate representation (like a list). The
975+ -- remaining size of the current input buffer is one such lower bound.
976+ --
964977-- @since 0.2.0.0
965978peekAvailable :: Decoder s Int
966979peekAvailable = Decoder (\ k -> return (PeekAvailable (\ len# -> k (I # len# ))))
967980{-# INLINE peekAvailable #-}
968981
982+ --------------------------------------------------------------
983+ -- Byte offsets and byte spans
984+
985+ -- $input
986+ --
987+ -- Sometimes it is important not just to be able to decode a CBOR serialisation
988+ -- but also to access the portion of the input that this corresponds to.
989+ --
990+ -- For example, in secuity-related applications it may be necessary to hash or
991+ -- verify a signature of a serialised CBOR term. In these cases it is vital to
992+ -- use the original serialised representation from the input data. While it is
993+ -- possible to re-serialise, that is not guaranteed to produce the same
994+ -- serialised representation, due to redundancy in representation. Even when
995+ -- using canonical CBOR it is still be best practice to use the original input.
996+ --
997+ -- The library provides two ways to get at the original input: indirectly using
998+ -- byte offsets and directly using byte spans.
999+ --
1000+ -- 'peekByteOffset' reports the current byte offset in the input data. This can
1001+ -- be used before and after decoding a term to find the byte offsets of the
1002+ -- term's original serialised representation.
1003+ --
1004+ -- > !before <- peekByteOffset
1005+ -- > x <- decode
1006+ -- > !after <- peekByteOffset
1007+ --
1008+ -- This pattern is captured by 'decodeWithByteOffsets'.
1009+ --
1010+ -- Externally to the decoder, if there is access to the original input then the
1011+ -- byte offsets may be used to select the span corresponding to the term.
1012+ --
1013+ -- Alternatively, the library provides a way to get at the input bytes directly.
1014+ -- This uses a scheme of manually marking the beginning and end of spans.
1015+ --
1016+ -- > openByteSpan
1017+ -- > x <- decode
1018+ -- > bytes <- peekByteSpan
1019+ -- > closeByteSpan
1020+ --
1021+ -- This pattern is captured by 'decodeWithByteSpan'.
1022+ --
1023+ -- The beginning of a byte span is marked using 'openByteSpan'. Then
1024+ -- 'peekByteSpan' can be used to return the byte span from where the span was
1025+ -- opened, to the deccoder's current point in the input stream.
1026+ --
1027+ -- It is important to pair every 'openByteSpan' with a corresponding
1028+ -- 'closeByteSpan'. The decoder maintains a stack of open byte spans. This
1029+ -- serves two purposes: 1. to allow nested use of byte spans, and 2. to ensure
1030+ -- input buffers are not retained unnecessarily.
1031+ --
1032+ -- Normally, the 'Decoder' does not retain input buffers after they have been
1033+ -- consumed. This allows decoders to be executed on an input stream
1034+ -- incrementally, while only keeping one input buffer in memory at once. This
1035+ -- behaviour is modified using 'openByteSpan': all input will be retained until
1036+ -- the corresponding 'closeByteSpan'.
1037+ --
1038+
9691039
9701040-- | A 0-based offset within the overall byte sequence that makes up the
9711041-- input to the 'Decoder'.
@@ -1019,19 +1089,27 @@ decodeWithByteOffsets da = do
10191089--
10201090type ByteSpan = LBS. ByteString
10211091
1022- -- |
1092+ -- | Mark the start of a new byte span at the decoder's current point in the
1093+ -- input byte stream. After this, use 'peekByteSpan' to get the bytes from
1094+ -- the most recent opening mark.
1095+ --
1096+ -- The use of 'openByteSpan' can be nested, and each use of 'openByteSpan'
1097+ -- should be matched by a corresponding 'closeByteSpan'.
10231098--
10241099-- @since 0.3.0.0
10251100openByteSpan :: Decoder s ()
10261101openByteSpan = Decoder (\ k -> return (OpenByteSpan (k () )))
10271102
1028- -- |
1103+ -- | Close (forget) the most recent byte span created by 'openByteSpan'. This
1104+ -- is important for proper nesting and to avoid retaining decoder input data
1105+ -- indefinitely.
10291106--
10301107-- @since 0.3.0.0
10311108closeByteSpan :: Decoder s ()
10321109closeByteSpan = Decoder (\ k -> return (CloseByteSpan (k () )))
10331110
1034- -- |
1111+ -- | Return the byte span since the most recent use of 'openByteSpan' (that has
1112+ -- not yet been closed by 'closeByteSpan').
10351113--
10361114-- Note: the 'ByteSpan' returned is a slice of the original decoder input
10371115-- stream, and thus will retain the input buffers. If you need to retain the
@@ -1047,7 +1125,7 @@ peekByteSpan = Decoder (\k -> return (PeekByteSpan k))
10471125--
10481126-- > openByteSpan
10491127-- > x <- decode
1050- -- > !after <- peekByteSpan
1128+ -- > bytes <- peekByteSpan
10511129-- > closeByteSpan
10521130--
10531131-- @since 0.3.0.0
0 commit comments