Caffe2 - C++ API
A deep learning, cross-platform ML framework
Namespaces | Data Structures | Typedefs | Enumerations | Functions | Variables
caffe2 Namespace Reference

A global dictionary that holds information about what Caffe2 modules have been loaded in the current runtime, and also utility functions to load modules. More...

Namespaces

 detail
 To make a c10 operator "C10Add" callable from caffe2 as "C2MyAddOpName", just write: More...
 

Data Structures

struct  _CaffeHighestPreallocatedTypeId
 
struct  AbsFunctor
 
struct  AbsGradientFunctor
 
struct  AbstractLengthsDef
 
class  AbstractLengthsGradientOp
 
class  AbstractLengthsOp
 Segment reduction op with optional fused embedding lookup. More...
 
class  AbstractLengthsWithMainInputAndForwardOutputGradientOp
 
class  AbstractLengthsWithMainInputGradientOp
 
struct  AbstractReduceBackDef
 
struct  AbstractReduceFrontDef
 
class  AbstractReduceFrontOrBackGradientOp
 
class  AbstractReduceFrontOrBackOp
 Simple non-segmented reduction over the first few dimensions of the tensor. More...
 
struct  AbstractSortedSegmentDef
 
class  AbstractSortedSegmentGradientOp
 
class  AbstractSortedSegmentOp
 Segment reduction op with optional fused embedding lookup. More...
 
struct  AbstractSortedSegmentRangeDef
 
class  AbstractSortedSegmentRangeGradientOp
 
class  AbstractSortedSegmentRangeOp
 Base implementation for segment reduction op that leverages continuity of the data. More...
 
struct  AbstractSparseLengthsDef
 
struct  AbstractSparseSortedSegmentDef
 
struct  AbstractSparseUnsortedSegmentDef
 
struct  AbstractUnsortedSegmentDef
 
class  AbstractUnsortedSegmentGradientOp
 
class  AbstractUnsortedSegmentOp
 Unsorted segment reduction op with optional fused embedding lookup. More...
 
class  AccumulateHistogramOp
 
class  AccumulateInputGradientOp
 
class  AccumulateOp
 
class  AccuracyOp
 
struct  AcosFunctor
 
struct  AcosGradientFunctor
 
class  AdadeltaOp
 
class  AdagradOp
 
class  AdamOp
 
class  AddDNNLowPOp
 
struct  AddFunctor
 
class  AddPaddingOp
 
class  AdjustBatchOp
 
class  AffineChannelGradientOp
 
class  AffineChannelOp
 
class  AlgorithmsCache
 
class  AliasOp
 Alias op makes the output and the input share the same underlying storage. More...
 
struct  AlignedDeleter
 
struct  AllocAligned
 
class  AlternateLearningRate
 
class  APMeterOp
 
struct  ArgMaxReducer
 
struct  ArgMinReducer
 
class  ArgOp
 
class  ArgumentHelper
 A helper class to index into arguments. More...
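 
 A short, illustrative sketch of reading arguments off an OperatorDef with ArgumentHelper; the argument names ("axis", "dims") are hypothetical:
 
    #include "caffe2/utils/proto_utils.h"
 
    void ReadArgs(const caffe2::OperatorDef& def) {
      caffe2::ArgumentHelper helper(def);
      // Scalar argument with a fallback used when the arg is absent.
      const int axis = helper.GetSingleArgument<int>("axis", -1);
      // Repeated argument returned as a vector.
      const std::vector<int> dims = helper.GetRepeatedArgument<int>("dims");
      (void)axis;
      (void)dims;
    }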
 
struct  AsinFunctor
 
struct  AsinGradientFunctor
 
class  AssertOp
 
class  AsyncNetBase
 
class  AsyncNetExecutorHelper
 
class  AsyncSchedulingNet
 
class  AsyncTask
 
class  AsyncTaskFuture
 
class  AsyncTaskGraph
 
class  AsyncTaskGraphBase
 
struct  AtanFunctor
 
struct  AtanGradientFunctor
 
class  AtomicIterOp
 
class  AveragedLoss
 
class  AveragedLossGradient
 
struct  AveragePoolFunctor
 
class  AvgExportedStat
 
class  BackendTransformerBase
 
class  BaseInputAccessor
 
class  BaseReducer
 
class  BaseReducerGradient
 
class  BatchBoxCoxOp
 
class  BatchBucketizeOp
 
class  BatchBucketOneHotOp
 
class  BatchDenseToSparseOp
 
class  BatchGatherGradientOp
 
class  BatchGatherOp
 
class  BatchMatMulDNNLowPOp
 
class  BatchMatMulOp
 
class  BatchMomentsGradientOp
 
class  BatchMomentsOp
 
class  BatchOneHotOp
 
class  BatchPermutationDNNLowPOp
 
class  BatchPermutationGradientOp
 
class  BatchPermutationOp
 
class  BatchSparseToDenseOp
 
class  BatchToSpaceOp
 
class  BBoxTransformOp
 
class  BernoulliJSDGradientOp
 
class  BernoulliJSDOp
 
class  BinaryElementwiseDNNLowPOp
 
class  BinaryElementwiseWithArgsGradientOp
 
class  BinaryElementwiseWithArgsGradientOp< NumericTypes, CPUContext, BinaryFunctorWithDefaultCtor< DivFunctor< CPUContext > >, SameTypeAsInput, SameTypeAsInput >
 
class  BinaryElementwiseWithArgsOp
 
struct  BinaryFunctorWithDefaultCtor
 
class  BisectPercentileOp
 
class  Blob
 Blob is a general container that hosts a typed pointer. More...
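 
 A minimal sketch of typical Blob usage (the hosted value here is arbitrary): the blob's contents are created lazily via GetMutable<T>() and inspected via IsType<T>() and Get<T>().
 
    #include "caffe2/core/blob.h"
 
    void BlobExample() {
      caffe2::Blob blob;
      // Create (or fetch) the hosted object as an int.
      int* value = blob.GetMutable<int>();
      *value = 42;
      // Query and read the typed contents.
      CAFFE_ENFORCE(blob.IsType<int>());
      const int& read_back = blob.Get<int>();
      (void)read_back;
    }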
 
class  BlobDeserializerBase
 BlobDeserializerBase is an abstract class that deserializes a blob from a BlobProto or a TensorProto. More...
 
class  BlobSerializerBase
 BlobSerializerBase is an abstract class that serializes a blob to a string. More...
 
class  BlobsQueue
 
struct  BlobStatGetter
 
struct  BlobStatRegistry
 
class  BlockingCounter
 
class  BooleanMaskOp
 
class  BooleanUnmaskOp
 
class  BoundShapeInferencer
 
struct  BoundShapeSpec
 
class  BoxWithNMSLimitOp
 
class  BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp
 
class  BufferedTokenizer
 
class  ByteWeightDequantOp
 
class  Caffe2Annotation
 
class  Caffe2ModuleTestDynamicDummyOp
 
struct  CastHelper
 
struct  CastHelper< std::string, SrcType >
 
class  CastOp
 
struct  CbrtFunctor
 
struct  CbrtGradientFunctor
 
class  CeilOp
 
class  ChannelBackpropStatsOp
 
class  ChannelShuffleDNNLowPOp
 
class  ChannelShuffleGradientOp
 
class  ChannelShuffleOp
 
class  ChannelStatsOp
 
struct  CharRange
 
class  CheckCounterDoneOp
 
class  CheckpointOp
 
class  ClipGradientOp
 
class  ClipOp
 
class  ClipTensorByScalingOp
 
class  CloseBlobsQueueOp
 
class  CloseRebatchingQueueOp
 
class  Col2ImOp
 
class  CollectAndDistributeFpnRpnProposalsOp
 
class  CommonSubexpressionEliminationTransform
 Common Subexpression Elimination. More...
 
class  CompositeLearningRate
 
class  CompositeLearningRateItem
 
class  ConcatDNNLowPOp
 
class  ConcatOp
 
class  ConditionalOp
 
class  ConstantFillOp
 
class  ConstantWarmupLearningRate
 
struct  ConvArgs
 
class  ConvDNNLowPAcc16Op
 Quantized Conv operator with 16-bit accumulation. More...
 
class  ConvDNNLowPOp
 
class  ConvDNNLowPPackWeightOp
 Pack a weight matrix that can be used by DNNLOWP Int8Conv operators. More...
 
class  Converter
 
class  ConvGradientOp
 
class  ConvOp
 
class  ConvPoolDNNLowPOpBase
 
class  ConvPoolOpBase
 
class  ConvReluOp
 
class  ConvToNNPackTransform
 
class  ConvTransposeGradientOp
 
class  ConvTransposeOp
 
class  ConvTransposeUnpoolBase
 
class  CopyCPUToIDEEPOp
 
class  CopyIDEEPToCPUOp
 
class  CopyOnDeviceLikeOp
 
class  CopyOp
 
struct  CosFunctor
 
struct  CosGradientFunctor
 
struct  CoshFunctor
 
struct  CoshGradientFunctor
 
class  CosineEmbeddingCriterionGradientOp
 
class  CosineEmbeddingCriterionOp
 
class  CosineSimilarityGradientOp
 
class  CosineSimilarityOp
 
class  CountDownOp
 
class  Counter
 
class  CountUpOp
 
class  CPUContext
 The CPU Context, representing the bare minimum of what a Context class in Caffe2 should implement. More...
 
struct  CPUEventWrapper
 
class  CpuId
 Identification of an Intel CPU. More...
 
class  CPUSparseLengthsReductionOp
 
class  CreateBlobsQueueOp
 
class  CreateCounterOp
 
class  CreateDBOp
 
class  CreateMapOp
 
class  CreateRebatchingQueueOp
 
class  CreateScopeOp
 
class  CreateTextFileReaderOp
 
class  CrossEntropyGradientOp
 
class  CrossEntropyOp
 
class  CTCBeamSearchDecoderOp
 
class  CTCGreedyDecoderOp
 
struct  CubeFunctor
 
struct  CubeGradientFunctor
 
class  CUDAContext
 
struct  CudaDevicePropWrapper
 
struct  CudaEventWrapper
 
class  CUDARecurrentNetworkExecutor
 
class  CudaRTCFunction
 
class  CuDNNActivationGradientOp
 
class  CuDNNActivationGradientOp< CUDNN_ACTIVATION_ELU >
 
class  CuDNNActivationOp
 
class  CuDNNActivationOp< CUDNN_ACTIVATION_ELU >
 
class  CuDNNActivationOpBase
 
class  CudnnConvGradientOp
 
class  CudnnConvOp
 
class  CudnnConvOpBase
 
class  CudnnConvTransposeGradientOp
 
class  CudnnConvTransposeOp
 
class  CudnnConvTransposeOpBase
 
class  cudnnFilterDescWrapper
 
class  CuDNNLRNGradientOp
 
class  CuDNNLRNOp
 
class  CuDNNSoftmaxGradientOp
 
class  CuDNNSoftmaxOp
 
class  CuDNNState
 
class  cudnnTensorDescWrapper
 cudnnTensorDescWrapper is the placeholder that wraps around a cudnnTensorDescriptor_t, allowing us to do descriptor change as-needed during runtime. More...
 
class  cudnnTypeWrapper
 cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn type in a template function. More...
 
class  cudnnTypeWrapper< at::Half >
 
class  cudnnTypeWrapper< double >
 
class  cudnnTypeWrapper< float >
 
class  CuDNNWeightedSumOp
 
struct  CuDNNWorkspace
 CuDNNWorkspace is a wrapper around a raw cuda pointer that holds the cudnn scratch space. More...
 
class  CuDNNWrapper
 CuDNNWrapper is a class that wraps the cudnn handles and cudnn workspaces. More...
 
class  DataCoupleOp
 
class  DBExistsOp
 
class  DecodedFrame
 
class  DefaultEngine
 
class  DeformConvGradientOp
 
class  DeformConvOp
 
class  DeformConvOpBase
 
class  DenseVectorToIdListOp
 
class  DequantizeDNNLowPOp
 
class  DequeueBlobsOp
 
class  DequeueRebatchingQueueOp
 
class  DetailedExportedStat
 
struct  DeviceTypeRegisterer
 
class  DiagonalFillOp
 
struct  DispatchHelper
 
struct  DispatchHelper< FixedValues< FirstVal, Values... >, ExtraArgs... >
 
struct  DispatchHelper< FixedValues<>, ExtraArgs... >
 
struct  DivFunctor
 
class  DNNLowPOp
 A convenient base class for C2 operators with DNNLOWP engine. More...
 
class  DoOp
 
class  DotProductGradientOp
 
class  DotProductOp
 
class  DotProductWithPaddingGradientOp
 
class  DotProductWithPaddingOp
 
class  DropoutGradientOp
 
class  DropoutOp
 
struct  EigenPowFunctor
 
class  ElementwiseLinearDNNLowPOp
 
class  ElementwiseLinearGradientOp
 
class  ElementwiseLinearOp
 
class  ElementwiseRTCOp
 A GPU operator that can generate limited elementwise operations. More...
 
struct  EluFunctor
 
struct  EluGradientFunctor
 
class  EnforceFiniteOp
 
class  EnqueueBlobsOp
 
class  EnqueueRebatchingQueueOp
 
class  EnsureClippedOp
 
class  EnsureCPUOutputOp
 
class  EnsureDenseOp
 Pass inputs to outputs. More...
 
struct  ErfFunctor
 
struct  ErfGradientFunctor
 
class  Event
 
struct  EventCreateFunctionRegisterer
 
struct  EventErrorMessageFunctionRegisterer
 
struct  EventFinishFunctionRegisterer
 
struct  EventQueryFunctionRegisterer
 
struct  EventRecordFunctionRegisterer
 
struct  EventResetFunctionRegisterer
 
struct  EventSetCallbackFunctionRegisterer
 
struct  EventSetFinishedFunctionRegisterer
 
struct  EventWaitFunctionRegisterer
 
struct  ExecutionOptions
 
class  ExecutorHelper
 
class  ExpandDimsOp
 
class  ExpandGradientOp
 
class  ExpandOp
 
struct  ExpFunctor
 
class  ExpLearningRate
 
class  ExportedStat
 
struct  ExportedStatValue
 
class  FailOp
 
class  FeedBlobOp
 
class  FileReader
 
class  FileStoreHandler
 
class  FileStoreHandlerCreateOp
 
class  FillerOp
 
class  FindDuplicateElementsOp
 
class  FindOp
 
class  FixedDivisor
 
class  FixedDivisor< std::int32_t >
 
class  FixedLearningRate
 
struct  FixedType
 
struct  FixedValues
 
class  FlattenOp
 
class  FlattenToVecOp
 
class  FlexibleTopKGradientOp
 
class  FlexibleTopKOp
 
class  Float16ConstantFillOp
 
class  Float16UniformFillOp
 
class  FloatToFused8BitRowwiseQuantizedOp
 
class  FloatToFusedRandRowwiseQuantizedOp
 
class  FloatToHalfOp
 
class  FloatToRowwiseQuantized8BitsOp
 
class  FloorOp
 
struct  ForEach
 ForEach is a unary functor that forwards each element of the input array into the elementwise Functor provided, and gathers the results of each call into the resulting array. More...
 
class  FP16MomentumSGDUpdateOp
 
class  FP32MomentumSGDUpdateOp
 
class  FreeOp
 
class  FtrlOp
 
struct  FtrlParams
 
class  FullyConnectedDecompGradientOp
 
class  FullyConnectedDNNLowPAcc16Op
 Quantized FC operator with 16-bit accumulation. More...
 
class  FullyConnectedDNNLowPOp
 
class  FullyConnectedDNNLowPPackWeightOp
 
class  FullyConnectedFakeLowpFPOp
 
class  FullyConnectedGradientFakeLowpFPOp
 
class  FullyConnectedGradientOp
 
class  FullyConnectedOp
 
class  FullyConnectedOp_SPARSE
 
class  FullyConnectedOpDecomp
 
class  FullyConnectedOpPrune
 
class  FullyConnectedPruneGradientOp
 
class  FunHashGradientOp
 
class  FunHashOp
 
class  Fused8BitRowwiseQuantizedToFloatOp
 
class  FusedRandRowwiseQuantizedToFloatOp
 
class  GatherByKeyOp
 
class  GatherDNNLowPOp
 
class  GatherFused8BitRowwiseOp
 
class  GatherOp
 
class  GatherPaddingOp
 
class  GatherRangesOp
 
class  GatherRangesToDenseOp
 
class  GaussianFillOp
 
class  GenerateProposalsOp
 
struct  GenericTensorImplementation
 
struct  GetAddPaddingGradient
 
class  GetAveragedLossGradient
 
class  GetBatchGatherGradient
 
class  GetBatchPermutationGradient
 
class  GetBatchToSpaceGradient
 
class  GetBernoulliJSDGradient
 
class  GetCastGradient
 
class  GetCol2ImGradient
 
class  GetConvGradient
 
class  GetConvTransposeGradient
 
struct  GetCopyGradient
 
class  GetCosineSimilarityGradient
 
struct  GetCPUToGPUGradient
 
class  GetCrossEntropyGradient
 
class  GetDotProductGradient
 
class  GetDotProductWithPaddingGradient
 
class  GetDropoutGradient
 
struct  GetElementwiseLinearGradient
 
class  GetExpandDimsGradient
 
class  GetFCDecompGradient
 
class  GetFloatToHalfGradient
 
struct  GetGPUToCPUGradient
 
class  GetGroupSpatialSoftmaxGradient
 
class  GetGRUUnitGradient
 
class  GetHalfToFloatGradient
 
class  GetIm2ColGradient
 
class  GetInstanceNormGradient
 
class  GetIntegralImageGradient
 
class  GetL1DistanceGradient
 
class  GetLabelCrossEntropyGradient
 
class  GetLeakyReluGradient
 
class  GetLRNGradient
 
class  GetLSTMUnitGradient
 
struct  GetMakeTwoClassGradient
 
class  GetMatMulGradient
 
class  GetMeanGradient
 
struct  GetNanCheckGradient
 
struct  GetNegateGradientGradient
 
class  GetNormalizeGradient
 
class  GetPackSegmentsGradient
 
class  GetPadImageGradient
 
class  GetPoolGradient
 
class  GetPrependDimGradient
 
struct  GetRecurrentGradient
 
struct  GetRecurrentNetworkGradient
 
class  GetReduceBackMaxGradient
 
class  GetReduceBackSumGradient
 
class  GetReduceFrontMaxGradient
 
class  GetReduceFrontMeanGradient
 
class  GetReduceFrontSumGradient
 
struct  GetRemovePaddingGradient
 
class  GetResizeNearestGradient
 
class  GetReversePackedSegsGradient
 
class  GetRoIPoolGradient
 
class  GetSampleAsGradient
 
class  GetScaleGradient
 
class  GetSelectSmoothL1LossGradient
 
class  GetSeluGradient
 
class  GetSigmoidCrossEntropyLossGradient
 
struct  GetSigmoidCrossEntropyWithLogitsGradient
 
class  GetSigmoidFocalLossGradient
 
class  GetSmoothL1LossGradient
 
class  GetSoftmaxFocalLossGradient
 
class  GetSoftplusGradient
 
class  GetSpaceToBatchGradient
 
class  GetSquaredL2DistanceGradient
 
class  GetSquareRootDivideGradient
 
class  GetSqueezeGradient
 
class  GetSumElementsGradient
 
class  GetTopKGradient
 
class  GetTransposeGradient
 
class  GetUnpackSegmentsGradient
 
class  GetUpsampleBilinearGradient
 
class  GetUpsampleNearestGradient
 
struct  GetWeightedSigmoidCrossEntropyWithLogitsGradient
 
struct  GetZeroGradientOpGradient
 
class  GFtrlOp
 
struct  GFtrlParams
 
class  GivenTensorByteStringToUInt8FillOp
 
class  GivenTensorFillOp
 
class  GlobalInitIsCalledGuard
 
class  GluOp
 
class  GPUFallbackOpEx
 A templated class to allow one to wrap a CPU operator as a CUDA operator. More...
 
class  GradientMakerBase
 
struct  GradientNotImplementedYet
 A helper class to indicate that the gradient mechanism is not ready. More...
 
struct  GradientOpsMeta
 A struct that holds the gradient operators and related gradient maps. More...
 
struct  GradientWrapper
 
class  GroupNormDNNLowPOp
 
class  GroupNormGradientOp
 
class  GroupNormOp
 
class  GroupSpatialSoftmaxGradientOp
 
class  GroupSpatialSoftmaxOp
 
class  GRUUnitGradientOp
 
class  GRUUnitOp
 
class  HalfToFloatOp
 
struct  HardSigmoidFunctor
 
struct  HardSigmoidGradientFunctor
 
class  HasElementsOp
 
class  HasScopeOp
 
class  HeatmapMaxKeypointOp
 
class  HillLearningRate
 
class  HistogramNetObserver
 
class  HistogramObserver
 Given min/max, collect histogram. More...
 
class  HSoftmaxGradientOp
 
class  HSoftmaxOp
 
class  HSoftmaxOpBase
 
class  HSoftmaxSearchOp
 
class  HuffmanTreeHierarchyOp
 
class  IDEEPAdamOp
 
class  IDEEPConcatOp
 
class  IDEEPContext
 
class  IDEEPConvFusionOp
 
class  IDEEPConvGradientOp
 
class  IDEEPConvOp
 
class  IDEEPConvPoolOpBase
 
class  IDEEPConvTransposeGradientOp
 
class  IDEEPConvTransposeOp
 
class  IDEEPConvTransposeUnpoolBase
 
class  IDEEPCopyOp
 
class  IDEEPCreateBlobsQueueOp
 
class  IDEEPDropoutGradientOp
 
class  IDEEPDropoutOp
 
class  IDEEPExpandDimsOp
 
class  IDEEPFallbackOp
 A templated class to allow one to wrap a CPU operator as an IDEEP operator. More...
 
class  IDEEPFullyConnectedGradientOp
 
class  IDEEPFullyConnectedOp
 
class  IDEEPLRNGradientOp
 
class  IDEEPLRNOp
 
class  IDEEPMomentumSGDOp
 
class  IDEEPMomentumSGDUpdateOp
 
class  IDEEPOperator
 
class  IDEEPPoolGradientOp
 
class  IDEEPPoolOp
 
class  IDEEPReluGradientOp
 
class  IDEEPReluOp
 
class  IDEEPReshapeOp
 
class  IDEEPSafeEnqueueBlobsOp
 
class  IDEEPShapeOp
 
class  IDEEPSigmoidGradientOp
 
class  IDEEPSigmoidOp
 
class  IDEEPSpatialBNGradientOp
 
class  IDEEPSpatialBNOp
 
class  IDEEPSplitOp
 
class  IDEEPSqueezeOp
 
class  IDEEPSumOp
 
class  IDEEPWeightedSumOp
 
class  IfOp
 
class  Im2ColOp
 
class  ImageInputOp
 
struct  Index
 
struct  IndexBase
 
class  IndexCreateOp
 
class  IndexDeserializer
 
class  IndexFreezeOp
 
class  IndexGetOp
 
class  IndexHashOp
 
class  IndexLoadOp
 
class  IndexSerializer
 
class  IndexSizeOp
 
class  IndexStoreOp
 
class  InitRegisterer
 
class  InstanceNormGradientOp
 
class  InstanceNormOp
 
struct  Int8ConvDNNLowPPackedWeightBlob
 Packed weight matrix for DNNLOWP Int8Conv operator. More...
 
struct  Int8FCDNNLowPPackedWeightBlob
 Packed weight matrix for DNNLOWP Int8FC operator. More...
 
class  IntegralImageGradientOp
 
class  IntegralImageOp
 
class  InvLearningRate
 
class  IsEmptyOp
 
class  IsMemberOfOp
 
class  IsMemberOfValueHolder
 
class  IsNanOp
 
class  IterOp
 
class  KeySplitOp
 
class  KeyValueToMapOp
 
class  L1DistanceGradientOp
 
class  L1DistanceOp
 
struct  L1Reducer
 
struct  L2Reducer
 
class  LabelCrossEntropyGradientOp
 
class  LabelCrossEntropyOp
 
class  LambdaRankNdcgGradientOp
 
class  LambdaRankNdcgOp
 
class  LarsOp
 
class  LayerNormGradientOp
 
class  LayerNormOp
 
class  LeakyReluGradientOp
 
class  LeakyReluOp
 
class  LearningRateAdaptionOp
 
class  LearningRateFunctor
 
class  LearningRateOp
 
class  LengthsGatherOp
 
struct  LengthsOpGetGradient
 
class  LengthsPadOp
 
class  LengthsPartitionOp
 
class  LengthsRangeFillOp
 
class  LengthsSplitOp
 
class  LengthsTileOp
 
class  LengthsTopKGradientOp
 
class  LengthsTopKOp
 
class  LengthsToRangesOp
 
class  LengthsToSegmentIdsOp
 
class  LengthsToShapeOp
 
class  LengthsToWeightsOp
 
class  LinearWarmupLearningRate
 
class  LoadOp
 
class  LocallyConnectedGradientOp
 
class  LocallyConnectedOp
 
class  LogFatalOp
 
struct  LogFunctor
 
struct  LogitFunctor
 
class  LogitGradientOp
 
class  LogMeanExpRangeReducer
 
class  LogMeanExpRangeReducer< T, CPUContext >
 
struct  LogMeanExpRangeReducerDef
 
class  LogMeanExpRangeReducerGradient
 
class  LogSumExpRangeReducer
 
class  LogSumExpRangeReducer< T, CPUContext >
 
struct  LogSumExpRangeReducerDef
 
class  LogSumExpRangeReducerGradient
 
class  LpNormGradientOp
 
class  LpNormOp
 
struct  LpPoolFunctor
 
class  LRNGradientOp
 
class  LRNOp
 
class  LRNOpBase
 
class  LSTMUnitDNNLowPOp
 
class  LSTMUnitGradientOp
 
class  LSTMUnitOp
 
struct  MakeAligned
 
class  MakeTwoClassGradientOp
 
class  MakeTwoClassOp
 
class  MapDeserializer
 
class  MapSerializer
 
class  MapToKeyValueOp
 
struct  MapTypeTraits
 
class  MarginRankingCriterionGradientOp
 
class  MarginRankingCriterionOp
 
class  MatMulOp
 
class  MaxGradientOp
 
class  MaxOp
 
struct  MaxPoolFunctor
 
class  MaxPoolGradientRTCOp
 
class  MaxPoolRTCOp
 
class  MaxPoolWithIndexGradientOp
 
class  MaxPoolWithIndexOp
 
class  MaxRangeReducer
 
class  MaxRangeReducer< T, CPUContext >
 
struct  MaxRangeReducerDef
 
class  MaxRangeReducerGradient
 
class  MaxReduceDimsGradientOp
 
class  MaxReduceDimsOp
 
class  MaxReducer
 
class  MaxReducer< T, CPUContext >
 
struct  MaxReducerDef
 
class  MaxReducerGradient
 
class  MaxReductionGradientOp
 
class  MaxReductionOp
 
class  MeanGradientOp
 
class  MeanOp
 
class  MeanRangeReducer
 
class  MeanRangeReducer< T, CPUContext >
 
struct  MeanRangeReducerDef
 
class  MeanRangeReducerGradient
 
class  MeanReducer
 
class  MeanReducer< T, CPUContext >
 
struct  MeanReducerDef
 
class  MeanReducerGradient
 
class  MergeDimOp
 
class  MergeIdListsOp
 
class  MergeMultiListFeatureTensorsOp
 
class  MergeMultiListOrMapFeatureTensorsGradientOp
 
class  MergeMultiMapFeatureTensorsOp
 
class  MergeMultiScalarFeatureTensorsGradientOp
 
class  MergeMultiScalarFeatureTensorsOp
 
class  MergeSingleListFeatureTensorsOp
 
class  MergeSingleListOrMapFeatureTensorsGradientOp
 
class  MergeSingleMapFeatureTensorsOp
 
class  MergeSingleScalarFeatureTensorsGradientOp
 
class  MergeSingleScalarFeatureTensorsOp
 
class  MinGradientOp
 
class  MinOp
 
struct  MinReducer
 
class  MIOPENActivationGradientOp
 
class  MIOPENActivationOp
 
class  MIOPENActivationOpBase
 
class  MIOPENState
 
class  miopenTensorDescWrapper
 miopenTensorDescWrapper is the placeholder that wraps around a miopenTensorDescriptor_t, allowing us to do descriptor change as-needed during runtime. More...
 
class  miopenTypeWrapper
 miopenTypeWrapper is a wrapper class that allows us to refer to the miopen type in a template function. More...
 
class  miopenTypeWrapper< at::Half >
 
class  miopenTypeWrapper< float >
 
struct  MIOPENWorkspace
 MIOPENWorkspace is a wrapper around a raw cuda pointer that holds the miopen scratch space. More...
 
class  MIOPENWrapper
 MIOPENWrapper is a class that wraps the miopen handles and miopen workspaces. More...
 
class  ModOp
 
class  ModuleSchema
 A module schema that can be used to store specific information about different modules. More...
 
class  MomentsGradientOp
 
class  MomentsOp
 
class  MomentumSGDOp
 
class  MomentumSGDUpdateOp
 
class  MPICommonWorldWrapper
 A simple wrapper over an MPI common world. More...
 
class  MPIDataTypeWrapper
 
struct  MPSCNNContext
 
class  MSRAFillOp
 
class  MulDNNLowPOp
 
struct  MulFunctor
 
class  MultiClassAccuracyOp
 
class  MutexDeserializer
 
class  MutexSerializer
 
class  NanCheckOp
 
class  NCHW2NHWCOp
 
class  NegateGradientOp
 
struct  NegativeFunctor
 
class  NetBase
 
class  NetObserverReporter
 
class  NetObserverReporterPrint
 
class  NGramFromCategoricalOp
 
class  NHWC2NCHWOp
 
class  NNApi
 
class  NNPACKConvOp
 
class  NoDefaultEngineOp
 A helper class to denote that an op does not have a default engine. More...
 
class  NoGradient
 A helper class to indicate that the operator does not need gradient computation. More...
 
class  NormalizeGradientOp
 
class  NormalizeL1Op
 
class  NormalizeOp
 
struct  NotFunctor
 
class  NumpyTileOp
 
class  Observable
 Inherit to make your class observable. More...
 
class  ObserverBase
 Use this to implement an Observer using the Observer Pattern template. More...
 
class  ObserverConfig
 
class  OneHotOp
 
class  OnnxifiOp
 
class  OnnxifiTransformer
 
struct  OnnxifiTransformerOptions
 
class  ONNXWhileOp
 
class  Operator
 
class  OperatorAttachingNetObserver
 
class  OperatorBase
 
class  OpSchema
 A class to record the schema of an op. More...
 
class  OpSchemaRegistry
 A registry to hold all the operator schemas. More...
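 
 Schemas are conventionally registered with the OPERATOR_SCHEMA macro, which records them in OpSchemaRegistry. A hedged sketch for a hypothetical operator "MyOp" (the name and argument are illustrative only):
 
    #include "caffe2/core/operator_schema.h"
 
    OPERATOR_SCHEMA(MyOp)
        .NumInputs(1)
        .NumOutputs(1)
        .IdenticalTypeAndShape()
        .SetDoc("Illustrative only: copies the input after scaling it.")
        .Arg("scale", "(float, default 1.0) multiplier applied to the input");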
 
struct  OpTask
 Data structure for a scheduled task in the task queue. More...
 
class  OptimizationPass
 
class  OpWrapper
 Wrap a floating-point operator with quantized inputs with type T. More...
 
class  OutputMinMaxNetObserver
 
class  OutputMinMaxObserver
 
class  PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp
 
class  PackRNNSequenceOpBase
 
class  PackSegmentsOp
 
class  PadEmptySamplesOp
 
class  PadImageGradientOp
 
class  PadImageOp
 
class  PairWiseLossGradientOp
 
class  PairWiseLossOp
 
class  ParallelNet
 
class  ParallelNetExecutorHelper
 
class  Params
 
class  PartitionOp
 
class  PartitionOpBase
 
class  PatternNetTransform
 PatternNetTransform allows you to create transforms using a simple interface. More...
 
class  PercentileOp
 
class  PerfNetObserver
 
class  PerfOperatorObserver
 
struct  PerformanceInformation
 
class  PerplexityOp
 
class  PiecewiseLinearTransformOp
 
class  PolyLearningRate
 
class  PoolGradientOp
 
class  PoolOp
 
class  PowOp
 
class  Predictor
 
struct  PredictorConfig
 Stores parameters necessary for creating a PredictorInterface object. More...
 
class  PrefetchOperator
 
class  PReluGradientOp
 
class  PReluOp
 
class  PrependDimOp
 
class  PrintOp
 
class  ProfDAGCounters
 A simple wrapper around prof_dag's counters. More...
 
class  ProfDAGReport
 
class  ProfDAGStats
 
class  ProfileCounter
 
class  ProfileObserver
 
class  ProfileOperatorObserver
 
class  PSRoIPoolGradientOp
 
class  PSRoIPoolOp
 
class  QConvOp
 
struct  QConvState
 
class  QTensor
 
class  QTensorDeserializer
 
class  QTensorSerializer
 
class  QuantDecodeGradientOp
 
class  QuantDecodeOp
 
class  QuantDecompZstdOp
 
class  QuantizeDNNLowPOp
 
class  RangeFillOp
 
class  RangeOp
 
class  RebatchingQueue
 
struct  ReciprocalFunctor
 
struct  ReciprocalGradientFunctor
 
class  RecurrentBaseOp
 
class  RecurrentGradientOp
 
class  RecurrentNetworkBlobFetcherOp
 
class  RecurrentNetworkExecutorBase
 RecurrentNetworkExecutor is a specialized runtime for recurrent neural networks (RNNs). More...
 
class  RecurrentNetworkGradientOp
 
class  RecurrentNetworkOp
 
class  RecurrentOp
 
class  RecurrentParamAccessOp
 
class  RedisStoreHandler
 
class  RedisStoreHandlerCreateOp
 
class  ReduceGradientOp
 
class  ReduceOp
 
class  ReduceTailSumOp
 
class  RegisterQuantizationParamsNetObserver
 Set quantization parameters of operators based on min/max collected from OutputMinMaxObserver. More...
 
class  RegisterQuantizationParamsWithHistogramNetObserver
 Set quantization parameters of operators based on min/max collected from OutputMinMaxObserver. More...
 
class  ReluDNNLowPOp
 
struct  ReluFunctor
 
struct  ReluGradientFunctor
 
struct  ReluNFunctor
 
struct  ReluNGradientFunctor
 
class  RemoveDataBlocksOp
 
class  RemovePaddingOp
 
class  ReplaceNaNOp
 
class  ResetCounterOp
 
class  ReshapeOp
 
class  ResizeLikeOp
 
class  ResizeNearestDNNLowPOp
 
class  ResizeNearestGradientOp
 
class  ResizeNearestOp
 
class  RetrieveCountOp
 
class  ReversePackedSegsOp
 
class  RMACRegionsOp
 
class  RmsPropOp
 
class  RNNApplyLinkOp
 
struct  RNNNetOperator
 Struct for an operator in a timestep and its dependencies. More...
 
class  RoIAlignGradientOp
 
class  RoIAlignOp
 
class  RoIAlignRotatedGradientOp
 
class  RoIAlignRotatedOp
 
class  RoIPoolFGradientOp
 
class  RoIPoolFOp
 
class  RoIPoolGradientOp
 
class  RoIPoolOp
 
class  RowMulOp
 
class  Rowwise8BitQuantizedToFloatOp
 
class  RowWiseSparseAdagradOp
 
class  RowWiseSparseAdamOp
 
struct  RsqrtFunctor
 
struct  RsqrtGradientFunctor
 
class  RunCountNetObserver
 
class  RunCountOperatorObserver
 
class  SafeDequeueBlobsOp
 
class  SafeEnqueueBlobsOp
 
struct  SameTypeAsInput
 
class  SampleAsGradientOp
 
class  SampleAsOp
 
struct  SampleInterval
 
class  SaveOp
 
class  ScaleOp
 
class  ScatterAssignOp
 Update slices of the tensor in-place by overwriting the existing values. More...
 
class  ScatterWeightedSumOp
 Update slices of the tensor in-place with weighted sum. More...
 
class  SegmentIdsToLengthsOp
 
class  SegmentIdsToRangesOp
 
class  SegmentOneHotOp
 
struct  SegmentOpGetGradient
 
class  SelectGradientOpBase
 
class  SelectSmoothL1LossGradientOp
 
class  SelectSmoothL1LossOp
 
class  SeluGradientOp
 
class  SeluOp
 
class  SequenceMaskOp
 
struct  ShapeInfo
 
class  ShapeOp
 
class  SigmoidCrossEntropyLossGradientOp
 
class  SigmoidCrossEntropyLossOp
 
class  SigmoidCrossEntropyWithLogitsGradientOp
 
class  SigmoidCrossEntropyWithLogitsOp
 
class  SigmoidFocalLossGradientOp
 
class  SigmoidFocalLossOp
 
class  SigmoidFunctor
 
struct  SigmoidGradientFunctor
 
class  SignalHandler
 
struct  SignFunctor
 
struct  SimpleArray
 
class  SimpleNet
 
class  SimpleQueue
 
class  SimpleRefCountNet
 
struct  SinFunctor
 
class  SingleOpTransform
 Single Op Transform Base class. More...
 
struct  SinGradientFunctor
 
struct  SinhFunctor
 
struct  SinhGradientFunctor
 
class  SinusoidPositionEncodingOp
 
class  SizeOp
 
class  SkipIndices
 
class  SkipIndices<>
 
class  SliceGradientOp
 
class  SliceOp
 
class  SmartTensorPrinter
 
class  SmoothL1LossGradientOp
 
class  SmoothL1LossOp
 
class  SNPEOp
 
class  SoftmaxFocalLossGradientOp
 
class  SoftmaxFocalLossOp
 
class  SoftmaxGradientOp
 
class  SoftmaxOp
 
class  SoftmaxWithLossGradientOp
 
class  SoftmaxWithLossOp
 
class  SoftplusGradientOp
 
class  SoftplusOp
 
struct  SoftsignFunctor
 
struct  SoftsignGradientFunctor
 
class  SpaceBatchOpBase
 
class  SpaceToBatchOp
 
class  SparseAdadeltaOp
 
class  SparseAdagradOp
 
class  SparseAdamOp
 
class  SparseFtrlOp
 
class  SparseFunHashGradientOp
 
class  SparseFunHashOp
 
class  SparseLengths8BitsRowwiseOp
 
class  SparseLengthsFused8BitRowwiseOp
 
class  SparseMatrixReshapeOp
 
class  SparseMomentumSGDUpdateOp
 
class  SparseNormalizeOp
 
class  SparseToDenseMaskBase
 
class  SparseToDenseMaskGradientOp
 
class  SparseToDenseMaskOp
 
class  SparseToDenseOp
 
class  SparseWngradOp
 
class  SpatialBNDNNLowPOp
 Note this implementation assumes the SCALE, BIAS, EST_MEAN, and EST_VAR inputs are still in fp32, as is the epsilon argument. More...
 
class  SpatialBNGradientOp
 
class  SpatialBNOp
 
class  SpatialNarrowAsGradient
 
class  SpatialNarrowAsGradientOp
 
class  SpatialNarrowAsOp
 
class  SpatialSoftmaxWithLossGradientOp
 
class  SpatialSoftmaxWithLossOp
 
class  SplitByLengthsOp
 
class  SplitOp
 
struct  SqrFunctor
 
struct  SqrtFunctor
 
class  SquaredL2DistanceGradientOp
 
class  SquaredL2DistanceOp
 
class  SquareRootDivideOp
 
class  SqueezeOp
 
struct  Stat
 
struct  StaticLinkingProtector
 
class  StaticStat
 
class  StatRegistry
 Holds a map of atomic counters keyed by name. More...
 
class  StatRegistryCreateOp
 
class  StatRegistryExportOp
 
class  StatRegistryUpdateOp
 
class  StatValue
 
class  StdDevExportedStat
 
class  StepLearningRate
 
class  StopGradientOp
 
struct  StopOnSignal
 
class  StoreAddOp
 
class  StoreGetOp
 
class  StoreHandler
 
struct  StoreHandlerNotAvailableException
 
struct  StoreHandlerTimeoutException
 
class  StoreSetOp
 
class  StoreWaitOp
 
class  StringDeserializer
 StringDeserializer is the deserializer for Strings. More...
 
class  StringJoinOp
 
struct  StringProvider
 
class  StringSerializer
 StringSerializer is the serializer for String. More...
 
class  StumpFuncIndexOp
 
class  StumpFuncOp
 
struct  SubFunctor
 
class  SumDNNLowPOp
 
class  SumElementsGradientOp
 
class  SumElementsIntOp
 
class  SumElementsOp
 
class  SummarizeOp
 
class  SumOp
 
class  SumRangeReducer
 
class  SumRangeReducer< T, CPUContext >
 
struct  SumRangeReducerDef
 
class  SumRangeReducerGradient
 
class  SumReduceDimsGradientOp
 
class  SumReduceDimsOp
 
class  SumReduceLikeOp
 
class  SumReducer
 
class  SumReducer< T, CPUContext >
 
struct  SumReducerDef
 
class  SumReducerGradient
 
class  SumReluOp
 
class  SumSqrElementsOp
 
struct  SwishFunctor
 
class  SwishGradientOp
 
struct  TanFunctor
 
struct  TanGradientFunctor
 
class  TanhFunctor
 
struct  TanhGradientFunctor
 
struct  Task
 
struct  TemplatePutOp
 
class  Tensor
 The Tensor class holds a shared pointer to the implementation TensorImpl and redirects API calls to it; copying a Tensor shares the same underlying implementation object. More...
 
class  TensorDeserializer
 TensorDeserializer is the deserializer for Tensors. More...
 
class  TensorFiller
 
class  TensorPrinter
 
class  TensorProtosDBInput
 
class  TensorSerializer
 TensorSerializer is the serializer for Tensors. More...
 
struct  TensorTypes
 
struct  TensorTypes2
 
struct  TextFileReaderInstance
 
class  TextFileReaderReadOp
 
class  ThreadedRecurrentNetworkExecutor
 
class  ThreadLocalCUDAObjects
 A struct to host thread-local cuda objects. More...
 
class  ThreadPool
 
class  ThresholdedReluGradientOp
 
class  ThresholdedReluOp
 
class  ThrowChildThreadExceptionOp
 
class  ThrowExceptionOp
 
struct  ThrowInTheTowelIfGradientIsCalled
 A helper class to indicate that the operator should have no gradient. More...
 
class  TileGradientOp
 
class  TileOp
 
class  TimeCounter
 
class  TimeObserver
 
class  TimeOperatorObserver
 
class  Timer
 A simple timer object for measuring time. More...
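 
 A small usage sketch of Timer for wall-clock measurements (the timed work is a placeholder):
 
    #include "caffe2/core/timer.h"
 
    float TimeSomething() {
      caffe2::Timer timer;                 // starts timing on construction
      // ... do the work being measured ...
      float elapsed_ms = timer.MilliSeconds();
      timer.Start();                       // restart for a new measurement
      return elapsed_ms;
    }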
 
struct  TimerBeginOp
 
struct  TimerEndOp
 
struct  TimerGetAndEndOp
 
struct  TimerGetOp
 
class  TimerInstance
 
struct  Token
 
class  TokenizedString
 
class  Tokenizer
 
class  TopKGradientOp
 
class  TopKOp
 
class  Transform
 The Transform Base Object. More...
 
class  TransposeOp
 
class  TTContractionGradientOp
 
class  TTContractionOp
 
class  TTLinearGradientOp
 
class  TTLinearOp
 
class  TTPadGradientOp
 
class  TTPadOp
 
class  TypeIdentifier
 A type id is a unique id for a given C++ type. More...
 
class  TypeMeta
 TypeMeta is a thin class that allows us to store the type of a container such as a blob, or the data type of a tensor, with a unique run-time id. More...
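 
 A brief, illustrative sketch of how TypeMeta is commonly queried, assuming the c10/util/typeid.h header location:
 
    #include <c10/util/typeid.h>
 
    void TypeMetaExample() {
      caffe2::TypeMeta meta = caffe2::TypeMeta::Make<float>();
      // Run-time queries on the recorded type.
      const char* name = meta.name();        // e.g. "float"
      size_t item_size = meta.itemsize();    // sizeof(float)
      bool is_float = meta.Match<float>();   // true
      (void)name; (void)item_size; (void)is_float;
    }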
 
struct  TypeNameTraits
 
struct  TypeNameTraits< int32_t >
 
struct  TypeNameTraits< int64_t >
 
class  UnaryElementwiseWithArgsDNNLowPOp
 
class  UnaryElementwiseWithArgsOp
 
struct  UnaryFunctorWithDefaultCtor
 
class  UniformFillOp
 
class  UniqueOp
 Deduplicates input indices vector and optionally produces reverse remapping. More...
 
class  UniqueUniformFillOp
 
class  UnpackSegmentsOp
 
class  UnsupportedOperatorFeature
 
class  UpsampleBilinearGradientOp
 
class  UpsampleBilinearOp
 
class  UpsampleNearestGradientOp
 
class  UpsampleNearestOp
 
class  VariableLengthSequencePaddingOp
 
class  VideoDecoder
 
class  VideoInputOp
 
class  VideoIOContext
 
struct  VideoMeta
 
class  WallClockTimeOp
 
class  WeightedMultiSamplingOp
 
class  WeightedSampleDequeueBlobsOp
 
class  WeightedSampleOp
 
class  WeightedSigmoidCrossEntropyWithLogitsGradientOp
 
class  WeightedSigmoidCrossEntropyWithLogitsOp
 
class  WeightedSumGradientOp
 
class  WeightedSumOp
 
class  WeightedSumReducer
 
class  WeightedSumReducer< T, CPUContext >
 
struct  WeightedSumReducerDef
 
class  WeightedSumReducerGradient
 
class  WhereOp
 
class  WhileOp
 
class  WngradOp
 
class  Worker
 
class  WorkersPool
 
class  Workspace
 Workspace is a class that holds all the related objects created during runtime: (1) all blobs, and (2) all instantiated networks. More...
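 
 A hedged sketch of the usual Workspace flow: create blobs, then instantiate and run a NetDef once (the net here is empty and only illustrative):
 
    #include "caffe2/core/workspace.h"
 
    void WorkspaceExample() {
      caffe2::Workspace ws;
      caffe2::Blob* data = ws.CreateBlob("data");   // blob name is arbitrary
      (void)data;
      CAFFE_ENFORCE(ws.HasBlob("data"));
 
      caffe2::NetDef net_def;                       // normally deserialized from a .pb file
      net_def.set_name("example_net");
      ws.RunNetOnce(net_def);                       // instantiate and run once
    }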
 
class  WorkspaceOptimizationPass
 
class  XavierFillOp
 
class  YellowFinOp
 
class  ZeroGradientOp
 
class  ZmqContext
 
class  ZmqMessage
 
class  ZmqSocket
 

Typedefs

template<typename Key , typename Value >
using CaffeMap = std::map< Key, Value >
 
using CUDAGuard = c10::cuda::CUDAGuard
 
using TensorCUDA = Tensor
 
typedef void(* EventCreateFunction) (const DeviceOption &option, Event *)
 
typedef void(* EventRecordFunction) (Event *, const void *, const char *)
 
typedef void(* EventWaitFunction) (const Event *, void *)
 
typedef void(* EventFinishFunction) (const Event *)
 
typedef EventStatus(* EventQueryFunction) (const Event *)
 
typedef const std::string &(* EventErrorMessageFunction) (const Event *)
 
typedef void(* EventSetFinishedFunction) (const Event *, const char *)
 
typedef void(* EventResetFunction) (Event *)
 
typedef std::function< void()> EventCallbackFunction
 
typedef void(* EventSetCallbackFunction) (Event *, EventCallbackFunction)
 
typedef ObserverBase< NetBase > NetObserver
 
typedef std::function< std::unique_ptr< NetObserver >(NetBase *)> NetObserverCreator
 
typedef ObserverBase< OperatorBase > OperatorObserver
 
typedef c10::Registry< std::string, std::unique_ptr< OperatorBase >, const OperatorDef &, Workspace * > *(* RegistryFunction) ()
 
using EnginePrefType = std::vector< std::string >
 
using PerOpEnginePrefType = CaffeMap< DeviceType, CaffeMap< std::string, EnginePrefType >>
 
using GlobalEnginePrefType = CaffeMap< DeviceType, EnginePrefType >
 
typedef std::function< bool(int)> ShouldContinue
 
using ExportedStatList = std::vector< ExportedStatValue >
 Holds names and values of counters exported from a StatRegistry.
 
using ExportedStatMap = std::unordered_map< std::string, int64_t >
 
using StorageImpl = at::StorageImpl
 
using Storage = at::Storage
 
using TensorCPU = Tensor
 
typedef TypeMeta(* TypeCall) (const void *)
 
typedef vector< int64_t >(* TensorInfoCall) (const void *, size_t *capacity, DeviceOption *device)
 
template<typename T >
using deleted_unique_ptr = std::unique_ptr< T, std::function< void(T *)>>
 
using ParallelFor = std::function< void(size_t, std::function< void(size_t)>)>
 
using NumericTypes = TensorTypes< int32_t, int64_t, float, double >
 
using IntTypes = TensorTypes< int32_t, int64_t >
 
using BoolTypes = TensorTypes< bool >
 
using IntBoolTypes = TensorTypes< int32_t, int64_t, bool >
 
template<typename InputTypes , class Context , class Functor , class OutputTypeMap = SameTypeAsInput>
using UnaryElementwiseOp = UnaryElementwiseWithArgsOp< InputTypes, Context, UnaryFunctorWithDefaultCtor< Functor >, OutputTypeMap >
 
template<typename InputTypes , class Context , class Functor , class TypeMap = SameTypeAsInput>
using BinaryElementwiseOp = BinaryElementwiseWithArgsOp< InputTypes, Context, BinaryFunctorWithDefaultCtor< Functor >, TypeMap >
 
template<typename InputTypes , class Context , class Functor , class OutputTypeMap = SameTypeAsInput, class GradientTypeMap = SameTypeAsInput>
using BinaryElementwiseGradientOp = BinaryElementwiseWithArgsGradientOp< InputTypes, Context, BinaryFunctorWithDefaultCtor< Functor >, OutputTypeMap, GradientTypeMap >
 
using SparseLengthsSumOp = CPUSparseLengthsReductionOp< float, TensorTypes< float, at::Half >, 0, 0 >
 
using SparseLengthsWeightedSumOp = CPUSparseLengthsReductionOp< float, TensorTypes< float, at::Half >, 1, 0 >
 
using SparseLengthsMeanOp = CPUSparseLengthsReductionOp< float, TensorTypes< float, at::Half >, 0, 1 >
 
using SparseLengthsSumDef = AbstractSparseLengthsDef< float, int, CPUContext, SumReducerDef, true >
 
using SparseLengthsWeightedSumDef = AbstractSparseLengthsDef< float, int, CPUContext, WeightedSumReducerDef, true >
 
using MapType64To64 = MapTypeTraits< int64_t, int64_t >::MapType
 
using MapType64To32 = MapTypeTraits< int64_t, int32_t >::MapType
 
using MapType32To32 = MapTypeTraits< int32_t, int32_t >::MapType
 
using MapType32To64 = MapTypeTraits< int32_t, int64_t >::MapType
 
using GPUFallbackOp = GPUFallbackOpEx< SkipIndices<>>
 
template<typename ScalarFunctor , typename TypeMap = FixedType<std::string>>
using StringElementwiseOp = UnaryElementwiseWithArgsOp< TensorTypes< std::string >, CPUContext, ForEach< ScalarFunctor >, TypeMap >
 
using ShapeInfoMap = std::unordered_map< std::string, ShapeInfo >
 
using PredictorParameters = std::map< std::string, std::shared_ptr< Blob >>
 
using DeviceType = at::DeviceType
 
using BatchPermutationFP32Op = CopyOp< CPUContext, CPUContext, CPUContext >
 
using ConvFp32Op = ConvOp< float, CPUContext >
 
using AddFp32Op = BinaryElementwiseOp< NumericTypes, CPUContext, AddFunctor< CPUContext >>
 
using ElementwiseLinearFp32Op = ElementwiseLinearOp< float, CPUContext >
 
using MulFp32Op = BinaryElementwiseOp< NumericTypes, CPUContext, MulFunctor< CPUContext >>
 
using FCFp32Op = FullyConnectedOp< CPUContext >
 
using GroupNormFP32Op = GroupNormOp< float, CPUContext >
 
using ResizeNearestFP32Op = ResizeNearestOp< float, CPUContext >
 
using RebatchingQueuePtr = std::unique_ptr< RebatchingQueue >
 
template<typename T >
using EigenMatrixMap = Eigen::Map< Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic >>
 
template<typename T >
using EigenArrayMap = Eigen::Map< Eigen::Array< T, Eigen::Dynamic, Eigen::Dynamic >>
 
template<typename T >
using EigenVectorMap = Eigen::Map< Eigen::Matrix< T, Eigen::Dynamic, 1 >>
 
template<typename T >
using EigenVectorArrayMap = Eigen::Map< Eigen::Array< T, Eigen::Dynamic, 1 >>
 
template<typename T >
using ConstEigenMatrixMap = Eigen::Map< const Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic >>
 
template<typename T >
using ConstEigenArrayMap = Eigen::Map< const Eigen::Array< T, Eigen::Dynamic, Eigen::Dynamic >>
 
template<typename T >
using ConstEigenVectorMap = Eigen::Map< const Eigen::Matrix< T, Eigen::Dynamic, 1 >>
 
template<typename T >
using ConstEigenVectorArrayMap = Eigen::Map< const Eigen::Array< T, Eigen::Dynamic, 1 >>
 
using EigenOuterStride = Eigen::OuterStride< Eigen::Dynamic >
 
using EigenInnerStride = Eigen::InnerStride< Eigen::Dynamic >
 
using EigenStride = Eigen::Stride< Eigen::Dynamic, Eigen::Dynamic >
 
template<typename T >
using EigenOuterStridedMatrixMap = Eigen::Map< Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic >, 0, EigenOuterStride >
 
template<typename T >
using EigenOuterStridedArrayMap = Eigen::Map< Eigen::Array< T, Eigen::Dynamic, Eigen::Dynamic >, 0, EigenOuterStride >
 
template<typename T >
using ConstEigenOuterStridedMatrixMap = Eigen::Map< const Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic >, 0, EigenOuterStride >
 
template<typename T >
using ConstEigenOuterStridedArrayMap = Eigen::Map< const Eigen::Array< T, Eigen::Dynamic, Eigen::Dynamic >, 0, EigenOuterStride >
 
template<typename T >
using EigenStridedMatrixMap = Eigen::Map< Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic >, 0, EigenStride >
 
template<typename T >
using EigenStridedArrayMap = Eigen::Map< Eigen::Array< T, Eigen::Dynamic, Eigen::Dynamic >, 0, EigenStride >
 
template<typename T >
using ConstEigenStridedMatrixMap = Eigen::Map< const Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic >, 0, EigenStride >
 
template<typename T >
using ConstEigenStridedArrayMap = Eigen::Map< const Eigen::Array< T, Eigen::Dynamic, Eigen::Dynamic >, 0, EigenStride >
 
template<typename T >
using EArrXt = Eigen::Array< T, Eigen::Dynamic, 1 >
 
using EArrXf = Eigen::ArrayXf
 
using EArrXd = Eigen::ArrayXd
 
using EArrXi = Eigen::ArrayXi
 
using EArrXb = EArrXt< bool >
 
template<typename T >
using EArrXXt = Eigen::Array< T, Eigen::Dynamic, Eigen::Dynamic >
 
using EArrXXf = Eigen::ArrayXXf
 
template<typename T >
using ERArrXXt = Eigen::Array< T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor >
 
using ERArrXXf = ERArrXXt< float >
 
template<typename T >
using EVecXt = Eigen::Matrix< T, Eigen::Dynamic, 1 >
 
using EVecXd = Eigen::VectorXd
 
using EVecXf = Eigen::VectorXf
 
using ERVecXd = Eigen::RowVectorXd
 
using ERVecXf = Eigen::RowVectorXf
 
template<typename T >
using EMatXt = Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic >
 
using EMatXd = Eigen::MatrixXd
 
using EMatXf = Eigen::MatrixXf
 
template<typename T >
using ERMatXt = Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor >
 
using ERMatXd = ERMatXt< double >
 
using ERMatXf = ERMatXt< float >
 

Enumerations

enum  CudaMemoryPoolType { NONE = 0, CUB = 1, THC = 2 }
 
enum  EventStatus { EVENT_INITIALIZED = 0, EVENT_SCHEDULED = 1, EVENT_SUCCESS = 2, EVENT_FAILED = 3 }
 
enum  StorageOrder { UNKNOWN = 0, NHWC = 1, NCHW = 2 }
 
enum  { ALGO_FWD = 0, ALGO_WGRAD = 1, ALGO_DGRAD = 2 }
 
enum  PadMode { CONSTANT = 0, REFLECT = 1, EDGE = 2 }
 
enum  QuantDecodeRunTy { RUN_ALWAYS, RUN_ONCE }
 
enum  RecurrentParamOpMode { SET_PARAM, GET_PARAM }
 
enum  FillerDistribution { FD_UNIFORM, FD_FIXEDSUM, FD_SYNTHETIC }
 
enum  FLowAlgType { FarnebackOpticalFlow = 0, DensePyrLKOpticalFlow = 1, BroxOpticalFlow = 2, OpticalFlowDual_TVL1 = 3 }
 
enum  FlowDataType { Flow2C = 0, Flow3C = 1, FlowWithGray = 2, FlowWithRGB = 3 }
 
enum  SpecialFps { SAMPLE_NO_FRAME = 0, SAMPLE_ALL_FRAMES = -1, SAMPLE_TIMESTAMP_ONLY = -2 }
 
enum  VideoResType { USE_WIDTH_HEIGHT = 0, USE_MINIMAL_WIDTH_HEIGHT = 1, ORIGINAL_RES = 2 }
 
enum  DecodeType { DO_TMP_JITTER = 0, DO_UNIFORM_SMP = 1, USE_START_FRM = 2 }
 

Functions

void swap (Blob &lhs, Blob &rhs)
 
std::ostream & operator<< (std::ostream &out, const Blob &v)
 
void reportTime (std::string type, double ts, std::string metric, std::string unit)
 
void splitSizes (const std::string &arg, int *ptr0, int *ptr1)
 
cv::Mat resizeImage (cv::Mat &img)
 
cv::Mat cropToRec (cv::Mat &img, int *height_ptr, int *width_ptr)
 
std::vector< float > convertToVector (cv::Mat &img)
 
std::vector< float > convertOneImage (std::string &filename, int *height_ptr, int *width_ptr)
 
int getBatchSize (int num_items)
 
TensorProtos writeValues (std::vector< std::vector< std::vector< float >>> &values, std::vector< std::vector< int >> &dims)
 
TensorProtos convertImages (std::string &image_file)
 
template<class TYPE >
vector< TYPE > splitString (std::string &line)
 
TensorProtos convertValues (std::string &file_name)
 
void ConvertToRawDataset (const string &input_db_name, const string &output_db_name)
 
void writeValues (std::vector< std::vector< std::vector< float >>> &values, std::vector< std::vector< int >> &dims, std::string output_file)
 
void convertImages ()
 
void convertValues ()
 
void ReadImage (std::ifstream *file, int *label, char *buffer)
 
void WriteToDB (const string &filename, const int num_items, const int &offset, db::DB *db)
 
void ConvertCIFAR ()
 
void ConvertImageDataset (const string &input_folder, const string &list_filename, const string &output_db_name, const bool)
 
uint32_t swap_endian (uint32_t val)
 
void convert_dataset (const char *image_filename, const char *label_filename, const char *db_path, const int data_limit)
 
void run ()
 
 CAFFE_KNOWN_TYPE (TypeMetaTestFoo)
 
 CAFFE_KNOWN_TYPE (TypeMetaTestBar)
 
 CAFFE_KNOWN_TYPE (ClassAllowAssignment)
 
 CAFFE_KNOWN_TYPE (ClassNoAssignment)
 
template<>
C10_EXPORT const detail::TypeMetaData * TypeMeta::_typeMetaDataInstance< detail::_Uninitialized > () noexcept
 
 CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE (25, detail::_guard_long_unique< long >)
 
 CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE (26, detail::_guard_long_unique< std::vector< long >>)
 
bool operator< (TypeIdentifier lhs, TypeIdentifier rhs)
 
std::ostream & operator<< (std::ostream &stream, caffe2::TypeIdentifier typeId)
 
bool operator== (const TypeMeta &lhs, const TypeMeta &rhs) noexcept
 
bool operator!= (const TypeMeta &lhs, const TypeMeta &rhs) noexcept
 
std::ostream & operator<< (std::ostream &stream, caffe2::TypeMeta typeMeta)
 
bool BlobIsTensorType (const Blob &blob, DeviceType device_type)
 
Tensor * BlobSetTensor (Blob *blob, Tensor &&tensor)
 
Tensor GetSizedTensorWithOptions (Tensor &&previous_tensor, at::IntArrayRef dims, at::TensorOptions options)
 
Tensor * BlobGetMutableTensor (Blob *blob, at::IntArrayRef dims, at::TensorOptions options)
 
Tensor XBlobGetMutableTensor (Blob *blob, at::IntArrayRef dims, at::TensorOptions options)
 
Tensor * BlobGetMutableTensor (Blob *blob, DeviceType device_type)
 
const Tensor & BlobGetTensor (const Blob &blob, DeviceType device_type)
 
Tensor BlobGetTensorOrUndefined (const Blob &blob)
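 
 The helpers above are the usual way to move Tensors in and out of Blobs. A small sketch, assuming the CPU device constant and an arbitrary shape:
 
    #include "caffe2/core/blob.h"
 
    void TensorInBlob() {
      caffe2::Blob blob;
      // Create (or reuse) a CPU tensor hosted by the blob and shape it.
      caffe2::Tensor* t = caffe2::BlobGetMutableTensor(&blob, caffe2::CPU);
      t->Resize(2, 3);
      float* data = t->mutable_data<float>();
      (void)data;
      // Read-only access, after checking what the blob holds.
      if (caffe2::BlobIsTensorType(blob, caffe2::CPU)) {
        const caffe2::Tensor& view = caffe2::BlobGetTensor(blob, caffe2::CPU);
        (void)view;
      }
    }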
 
void SerializeBlob (const Blob &blob, const string &name, BlobSerializerBase::SerializationAcceptor acceptor, int chunk_size=kDefaultChunkSize)
 Serializes the given blob, if possible. More...
 
std::string SerializeBlob (const Blob &blob, const string &name)
 Convenience function to serialize a blob to a string. More...
 
int GetGPUIDForPointer (const void *ptr)
 Gets the GPU id that the current pointer is located at.
 
 C10_DEFINE_TYPED_REGISTRY (BlobSerializerRegistry, TypeIdentifier, BlobSerializerBase, std::unique_ptr)
 
 C10_DEFINE_REGISTRY (BlobDeserializerRegistry, BlobDeserializerBase)
 
void DeserializeBlob (const string &content, Blob *result)
 Deserializes from a string containing either BlobProto or TensorProto. More...
 
void DeserializeBlob (const BlobProto &blob_proto, Blob *result)
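 
 A round-trip sketch using the serialization helpers above (the blob contents and the name "my_blob" are arbitrary):
 
    #include "caffe2/core/blob_serialization.h"
 
    void RoundTrip(const caffe2::Blob& blob) {
      // Serialize to a string holding a BlobProto.
      std::string serialized = caffe2::SerializeBlob(blob, "my_blob");
      // Reconstruct the blob from that string.
      caffe2::Blob restored;
      caffe2::DeserializeBlob(serialized, &restored);
    }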
 
Tensor EmptyTensorFromProto (const TensorProto &tensor_proto)
 
std::string SerializeAsString_EnforceCheck (const google::protobuf::MessageLite &msg, const char *error_location)
 
std::string SerializeBlobProtoAsString_EnforceCheck (const BlobProto &blob)
 
 C10_DECLARE_TYPED_REGISTRY (BlobSerializerRegistry, TypeIdentifier, BlobSerializerBase, std::unique_ptr)
 
unique_ptr< BlobSerializerBase > CreateSerializer (TypeIdentifier id)
 
 C10_DECLARE_REGISTRY (BlobDeserializerRegistry, BlobDeserializerBase)
 
unique_ptr< BlobDeserializerBase > CreateDeserializer (const string &type)
 
bool HasCudaRuntime ()
 
bool HasHipRuntime ()
 
const std::map< string, string > & GetBuildOptions ()
 
template<typename T , typename... Args>
std::enable_if<!std::is_array< T >::value, std::unique_ptr< T > >::type make_unique (Args &&...args)
 
template<typename T >
std::enable_if< std::is_array< T >::value, std::unique_ptr< T > >::type make_unique (const size_t n)
 
template<typename T , typename... Args>
std::enable_if< std::extent< T >::value!=0, std::unique_ptr< T > >::type make_unique (Args &&...)=delete
 
template<typename Dst , typename Src >
Dst dynamic_cast_if_rtti (Src ptr)
 
size_t cudnnCompiledVersion ()
 
size_t cudnnRuntimeVersion ()
 
void CheckCuDNNVersions ()
 
cudnnTensorFormat_t GetCudnnTensorFormat (const StorageOrder &order)
 A wrapper function to convert the Caffe storage order to cudnn storage order enum values.
 
int NumCudaDevices ()
 Returns the number of devices.
 
void SetDefaultGPUID (const int deviceid)
 
int GetDefaultGPUID ()
 
int CaffeCudaGetDevice ()
 Gets the current GPU id. More...
 
void CaffeCudaSetDevice (const int id)
 Sets the current GPU id. More...
 
const cudaDeviceProp & GetDeviceProperty (const int device)
 Gets the device property for the given device. More...
 
void DeviceQuery (const int deviceid)
 Runs a device query function and prints out the results to LOG(INFO).
 
bool GetCudaPeerAccessPattern (vector< vector< bool > > *pattern)
 
bool TensorCoreAvailable ()
 Return the availability of TensorCores for math.
 
const char * cublasGetErrorString (cublasStatus_t error)
 Return a human readable cublas error string.
 
const char * curandGetErrorString (curandStatus_t error)
 Return a human readable curand error string.
 
int CudaVersion ()
 A runtime function to report the cuda version that Caffe2 is built with.
 
bool HasCudaGPU ()
 Check if the current running session has a cuda gpu present. More...
 
CAFFE2_CUDA_API bool GetCudaPeerAccessPattern (vector< vector< bool >> *pattern)
 Return a peer access pattern by returning a matrix (in the format of a nested vector) of boolean values specifying whether peer access is possible. More...
 
int CAFFE_GET_BLOCKS (const int N)
 Compute the number of blocks needed to run N threads.
 
dim3 CAFFE_GET_BLOCKS_2D (const int N, const int)
 Compute the number of blocks needed to run N threads for a 2D grid.
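 
 These helpers are typically used to size CUDA kernel launches. A short sketch of the common 1-D pattern (the kernel name is hypothetical; CAFFE_GET_BLOCKS and the CAFFE_CUDA_NUM_THREADS constant are assumed to come from caffe2/core/common_gpu.h):
 
    const int N = 1 << 20;
    const int num_blocks = caffe2::CAFFE_GET_BLOCKS(N);
    // Typical launch shape inside a Caffe2 CUDA operator:
    //   my_kernel<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(N, ...);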
 
uint32_t RandomNumberSeed ()
 A function to generate a random number seed that is unique on a best-effort basis, using an ever-incrementing seed and the current time.
 
CAFFE2_CUDA_API CudaMemoryPoolType GetCudaMemoryPoolType ()
 Gets the current memory pool type used by Caffe2. More...
 
 CAFFE_KNOWN_TYPE (db::DBReader)
 
 CAFFE_KNOWN_TYPE (db::Cursor)
 
void EventCreateCPU (const DeviceOption &option, Event *event)
 
void EventRecordCPU (Event *event, const void *, const char *err_msg)
 
void EventFinishCPU (const Event *event)
 
void EventWaitCPUCPU (const Event *event, void *)
 
EventStatus EventQueryCPU (const Event *event)
 
const std::string & EventErrorMessageCPU (const Event *event)
 
void EventSetFinishedCPU (const Event *event, const char *err_msg)
 
void EventSetCallbackCPU (Event *event, EventCallbackFunction callback)
 
void EventResetCPU (Event *event)
 
 REGISTER_EVENT_CREATE_FUNCTION (CPU, EventCreateCPU)
 
 REGISTER_EVENT_RECORD_FUNCTION (CPU, EventRecordCPU)
 
 REGISTER_EVENT_WAIT_FUNCTION (CPU, CPU, EventWaitCPUCPU)
 
 REGISTER_EVENT_FINISH_FUNCTION (CPU, EventFinishCPU)
 
 REGISTER_EVENT_QUERY_FUNCTION (CPU, EventQueryCPU)
 
 REGISTER_EVENT_ERROR_MESSAGE_FUNCTION (CPU, EventErrorMessageCPU)
 
 REGISTER_EVENT_SET_FINISHED_FUNCTION (CPU, EventSetFinishedCPU)
 
 REGISTER_EVENT_RESET_FUNCTION (CPU, EventResetCPU)
 
 REGISTER_EVENT_SET_CALLBACK_FUNCTION (CPU, EventSetCallbackCPU)
 
bool EventCanScheduleCPU (const Event *, const Event *)
 
void EventCreateCUDA (const DeviceOption &option, Event *event)
 
void EventRecordCUDA (Event *event, const void *context, const char *err_msg)
 
void EventFinishCUDA (const Event *event)
 
void EventWaitCUDACUDA (const Event *event, void *context)
 
void EventWaitCPUCUDA (const Event *event, void *context)
 
void EventWaitCUDACPU (const Event *event, void *context)
 
EventStatus EventQueryCUDA (const Event *event)
 
const std::string & EventErrorMessageCUDA (const Event *event)
 
void EventSetFinishedCUDA (const Event *event, const char *err_msg)
 
void EventResetCUDA (Event *event)
 
 REGISTER_EVENT_CREATE_FUNCTION (CUDA, EventCreateCUDA)
 
 REGISTER_EVENT_RECORD_FUNCTION (CUDA, EventRecordCUDA)
 
 REGISTER_EVENT_WAIT_FUNCTION (CUDA, CUDA, EventWaitCUDACUDA)
 
 REGISTER_EVENT_WAIT_FUNCTION (CPU, CUDA, EventWaitCPUCUDA)
 
 REGISTER_EVENT_WAIT_FUNCTION (CUDA, CPU, EventWaitCUDACPU)
 
 REGISTER_EVENT_FINISH_FUNCTION (CUDA, EventFinishCUDA)
 
 REGISTER_EVENT_QUERY_FUNCTION (CUDA, EventQueryCUDA)
 
 REGISTER_EVENT_ERROR_MESSAGE_FUNCTION (CUDA, EventErrorMessageCUDA)
 
 REGISTER_EVENT_SET_FINISHED_FUNCTION (CUDA, EventSetFinishedCUDA)
 
 REGISTER_EVENT_RESET_FUNCTION (CUDA, EventResetCUDA)
 
 REGISTER_EVENT_WAIT_FUNCTION (MKLDNN, CUDA, EventWaitCPUCUDA)
 
 REGISTER_EVENT_WAIT_FUNCTION (CUDA, MKLDNN, EventWaitCUDACPU)
 
OperatorDef * AddOp (NetDef *netdef_ptr, string op_type, std::vector< string > inputs, std::vector< string > outputs)
 
bool MatchStrings (string p, string s)
 This allows for the use of * and | to match operator types, engines, or any other property that is represented by strings. More...
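 
 Based on the description above, a few illustrative matches (the header location is an assumption):
 
    #include "caffe2/core/graph.h"   // assumed header for MatchStrings
 
    void MatchExamples() {
      bool a = caffe2::MatchStrings("*", "Conv");       // wildcard: matches anything
      bool b = caffe2::MatchStrings("Conv|FC", "FC");   // '|' lists alternatives -> true
      bool c = caffe2::MatchStrings("Conv|FC", "Relu"); // no alternative matches -> false
      (void)a; (void)b; (void)c;
    }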
 
bool MatchArguments (const OperatorDef &p_op, const OperatorDef &g_op)
 This ensures that each named arg that exists in the pattern also exists in g_op and is equal in value.
 
size_t miopenCompiledVersion ()
 
size_t miopenRuntimeVersion ()
 
void CheckMIOPENVersions ()
 
bool GlobalInitAlreadyRun ()
 Determine whether GlobalInit has already been run.
 
bool GlobalInit (int *pargc, char ***argv)
 Initialize the global environment of caffe2. More...
 
bool GlobalInit ()
 Initialize the global environment without command line arguments. More...
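 
 The usual entry-point pattern for a Caffe2 binary; a minimal sketch:
 
    #include "caffe2/core/init.h"
 
    int main(int argc, char** argv) {
      // Parses Caffe2 flags, sets up logging, and runs registered init functions.
      caffe2::GlobalInit(&argc, &argv);
      // ... application code ...
      return 0;
    }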
 
bool Caffe2CheckIntrinsicsFeatures (int *, char ***)
 
 REGISTER_CAFFE2_INIT_FUNCTION (Caffe2CheckIntrinsicsFeatures,&Caffe2CheckIntrinsicsFeatures,"Check intrinsics compatibility between the CPU feature and the binary.")
 
const CaffeMap< string, const ModuleSchema * > & CurrentModules ()
 Current Modules present in the Caffe2 runtime. More...
 
bool HasModule (const string &name)
 Checks whether a module is already present in the current binary.
 
void LoadModule (const string &name, const string &filename="")
 Load a module. More...
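 
 A hedged sketch of loading an optional module at runtime; the module name and library path below are hypothetical:
 
    #include "caffe2/core/module.h"
 
    void EnsureModule() {
      if (!caffe2::HasModule("caffe2_rocksdb")) {
        // An empty filename asks Caffe2 to search the default locations instead.
        caffe2::LoadModule("caffe2_rocksdb", "/path/to/libcaffe2_rocksdb.so");
      }
    }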
 
 C10_DEFINE_REGISTRY (NetRegistry, NetBase, const std::shared_ptr< const NetDef > &, Workspace *)
 
void AddGlobalNetObserverCreator (NetObserverCreator creator)
 
void ClearGlobalNetObservers ()
 
unique_ptr< NetBase > CreateNet (const NetDef &net_def, Workspace *ws)
 Creates a network, accessing / creating blobs in the given workspace. More...
 
unique_ptr< NetBase > CreateNet (const std::shared_ptr< const NetDef > &net_def, Workspace *ws)
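 
 A short sketch of instantiating and running a net inside a workspace; the NetDef is assumed to have been populated elsewhere (e.g. deserialized from a protobuf file):
 
    #include "caffe2/core/net.h"
    #include "caffe2/core/workspace.h"
 
    void RunNet(const caffe2::NetDef& net_def) {
      caffe2::Workspace ws;
      std::unique_ptr<caffe2::NetBase> net = caffe2::CreateNet(net_def, &ws);
      if (net) {
        net->Run();   // execute all operators once
      }
    }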
 
 C10_DECLARE_REGISTRY (NetRegistry, NetBase, const std::shared_ptr< const NetDef > &, Workspace *)
 
template<class TaskThreadPoolImpl , int device_type>
std::shared_ptr< TaskThreadPoolBase > GetAsyncNetThreadPool (int device_id, int pool_size, bool create_new)
 
 REGISTER_NET (async_scheduling, AsyncSchedulingNet)
 
std::shared_ptr< AsyncTaskGraphBase > GetAsyncTaskGraph (ExecutorHelper *helper, const ExecutionOptions &options)
 
 C10_DEFINE_SHARED_REGISTRY (TaskGraphRegistry, AsyncTaskGraphBase, ExecutorHelper *, const ExecutionOptions &)
 
 C10_REGISTER_CREATOR (TaskGraphRegistry, futures, GetAsyncTaskGraph)
 
 REGISTER_NET (parallel, ParallelNet)
 
 C10_DECLARE_SHARED_REGISTRY (TaskGraphRegistry, AsyncTaskGraphBase, ExecutorHelper *, const ExecutionOptions &)
 
 REGISTER_NET (simple, SimpleNet)
 
 REGISTER_NET (simple_refcount, SimpleRefCountNet)
 
const std::string OpRegistryKey (const std::string &op_type, const std::string &engine)
 
void SetPerOpEnginePref (const PerOpEnginePrefType &per_op_engine_pref)
 
void SetGlobalEnginePref (const GlobalEnginePrefType &global_engine_pref)
 
void SetEnginePref (const PerOpEnginePrefType &per_op_engine_pref, const GlobalEnginePrefType &global_engine_pref)
 
void SetOpEnginePref (const std::string &op_type, const CaffeMap< DeviceType, EnginePrefType > &op_pref)
 
unique_ptr< OperatorBase > CreateOperator (const OperatorDef &operator_def, Workspace *ws, int net_position)
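A sketch of instantiating and running a single operator directly, outside of a net; the blob names are hypothetical and net_position is given an arbitrary value here since the op does not belong to a net:

```cpp
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

void RunSingleRelu(caffe2::Workspace* ws) {
  caffe2::OperatorDef def;
  def.set_type("Relu");
  def.add_input("X");    // assumed to already exist in the workspace
  def.add_output("Y");

  // net_position is only meaningful when the op is part of a net; 0 is arbitrary.
  auto op = caffe2::CreateOperator(def, ws, /*net_position=*/0);
  op->Run();
}
```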
 
std::map< DeviceType, OperatorRegistry * > * gDeviceTypeRegistry ()
 
 C10_DEFINE_REGISTRY (CPUOperatorRegistry, OperatorBase, const OperatorDef &, Workspace *)
 
 CAFFE_REGISTER_DEVICE_TYPE (CPU, CPUOperatorRegistry)
 
 C10_DEFINE_REGISTRY (CUDAOperatorRegistry, OperatorBase, const OperatorDef &, Workspace *)
 
 CAFFE_REGISTER_DEVICE_TYPE (CUDA, CUDAOperatorRegistry)
 
 C10_DEFINE_REGISTRY (HIPOperatorRegistry, OperatorBase, const OperatorDef &, Workspace *)
 
 CAFFE_REGISTER_DEVICE_TYPE (HIP, HIPOperatorRegistry)
 
 C10_DEFINE_REGISTRY (GradientRegistry, GradientMakerBase, const OperatorDef &, const vector< GradientWrapper > &)
 
GradientOpsMeta GetGradientForOp (const OperatorDef &def, const vector< GradientWrapper > &g_output)
 Gets the GradientOpsMeta for the given operator def.
 
TensorShapes InferBlobShapesAndTypes (CaffeMap< string, TensorShape > &blob_desc, const vector< NetDef * > &nets)
 
TensorShape GetTensorShapeOfBlob (const Blob *b)
 
TensorShapes InferBlobShapesAndTypesFromWorkspace (Workspace *ws, const vector< NetDef * > &nets)
 
TensorShapes InferBlobShapesAndTypesFromMap (const CaffeMap< std::string, std::vector< int64_t >> &blob_dimensions, const vector< NetDef * > &nets)
 
TensorShapes InferBlobShapesAndTypesFromMap (const CaffeMap< std::string, std::vector< int64_t >> &blob_dimensions, const CaffeMap< std::string, TensorProto_DataType > &blob_types, const vector< NetDef * > &nets)
 
std::map< string, std::pair< DeviceOption, DeviceOption > > ValidateTensorDevices (OperatorBase &op, const OperatorDef &op_def)
 
std::set< std::string > GetRegisteredOperators ()
 
void SetOperatorLogger (std::function< void(const OperatorDef &)> tracer)
 
std::function< void(const OperatorDef &)> GetOperatorLogger ()
 
 C10_DEFINE_TENSOR_TYPES_DISPATCHER (TensorTypes, DoRunWithType, DoRunWithOtherType)
 
 C10_DEFINE_TENSOR_TYPES_DISPATCHER (TensorTypes2, DoRunWithType2, DoRunWithOtherType2)
 
 C10_DECLARE_REGISTRY (CPUOperatorRegistry, OperatorBase, const OperatorDef &, Workspace *)
 
 C10_DECLARE_REGISTRY (CUDAOperatorRegistry, OperatorBase, const OperatorDef &, Workspace *)
 
 C10_DECLARE_REGISTRY (HIPOperatorRegistry, OperatorBase, const OperatorDef &, Workspace *)
 
 C10_DEFINE_REGISTRY (C10OperatorRegistry, OperatorBase, const OperatorDef &, Workspace *)
 
 C10_DECLARE_REGISTRY (GradientRegistry, GradientMakerBase, const OperatorDef &, const vector< GradientWrapper > &)
 
C10_EXPORT std::ostream & operator<< (std::ostream &out, const OpSchema &schema)
 
template<typename T_I = int>
TensorShape CreateTensorShape (vector< T_I > dims,::caffe2::TensorProto_DataType dt)
 
vector< int64_t > GetDimsVector (const TensorShape &shape)
 
uint64_t nElemFromDim (const TensorShape &X, int dim=0)
 
uint64_t nElemBetweenDim (const TensorShape &X, int start, int stop)
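A small sketch tying together the TensorShape helpers listed above; the dims are arbitrary and the include path for these helpers may differ per version:

```cpp
#include <vector>
#include "caffe2/core/operator.h"  // declares these shape helpers in recent versions

void ShapeHelpersDemo() {
  // Build a FLOAT shape with dims {2, 3, 4}.
  caffe2::TensorShape shape =
      caffe2::CreateTensorShape(std::vector<int>{2, 3, 4}, caffe2::TensorProto::FLOAT);

  std::vector<int64_t> dims = caffe2::GetDimsVector(shape);  // {2, 3, 4}
  uint64_t total = caffe2::nElemFromDim(shape);              // 2 * 3 * 4 = 24
  uint64_t inner = caffe2::nElemBetweenDim(shape, 1, 3);     // 3 * 4 = 12
  (void)dims; (void)total; (void)inner;
}
```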
 
std::pair< std::vector< DeviceOption >, std::vector< DeviceOption > > InferOpInputOutputDevice (const OperatorDef &op)
 
template<uint64_t OpsPerPoint>
OpSchema::Cost PointwiseCostInference (const OperatorDef &, const vector< TensorShape > &inputs)
 
bool RunPlanOnWorkspace (Workspace *ws, const PlanDef &plan, ShouldContinue shouldContinue)
 
 CAFFE_KNOWN_TYPE (QTensor< CPUContext >)
 
template<typename F >
detail::ScopeGuardImplDecay< F > MakeGuard (F &&f) noexcept(noexcept(detail::ScopeGuardImplDecay< F >(static_cast< F && >(f))))
 ScopeGuard is a general implementation of the "Initialization is Resource Acquisition" idiom. More...
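A minimal sketch of the scope-guard idiom; the cleanup lambda runs when the guard leaves scope, regardless of how the function returns:

```cpp
#include <cstdio>
#include "caffe2/core/scope_guard.h"

void WriteWithGuard(const char* path) {
  std::FILE* f = std::fopen(path, "w");
  if (!f) {
    return;
  }
  // The guard closes the file on every exit path from this function.
  auto guard = caffe2::MakeGuard([f] { std::fclose(f); });
  std::fputs("hello\n", f);
}
```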
 
ExportedStatMap toMap (const ExportedStatList &stats)
 
 CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE (12, Tensor)
 
TypeMeta GetTensorType (const void *c)
 
TypeCall GetTypeCallFunction (TypeIdentifier id)
 
void RegisterTypeCallFunction (TypeIdentifier id, TypeCall c)
 
vector< int64_t > GetTensorInfo (const void *c, size_t *capacity, DeviceOption *device)
 
TensorInfoCall GetTensorInfoFunction (TypeIdentifier id)
 
void RegisterTensorInfoFunction (TypeIdentifier id, TensorInfoCall c)
 
void TensorVectorResize (std::vector< Tensor > &tensors, int size, DeviceType type)
 
Tensor empty (at::IntArrayRef dims, at::TensorOptions options)
 
void ReinitializeTensor (Tensor *t, at::IntArrayRef dims, at::TensorOptions options)
 Reinitializes a Tensor to the given dims and options if necessary; note that this does nothing if the Tensor already has the correct size and data type.
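A sketch of allocating a CPU tensor with empty() and then resizing it with ReinitializeTensor(); the shapes below are arbitrary:

```cpp
#include "caffe2/core/tensor.h"

void ResizeScratch() {
  // Allocate a 2x3 float CPU tensor.
  caffe2::Tensor t =
      caffe2::empty({2, 3}, at::dtype<float>().device(caffe2::CPU));

  // No-op here: dims and dtype already match.
  caffe2::ReinitializeTensor(&t, {2, 3}, at::dtype<float>().device(caffe2::CPU));

  // Reallocates because the shape changed.
  caffe2::ReinitializeTensor(&t, {4, 5}, at::dtype<float>().device(caffe2::CPU));
}
```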
 
void ReinitializeAndCopyFrom (Tensor *t, at::TensorOptions options, const Tensor &src, bool async)
 
template<typename T >
Tensor TensorCPUFromValues (at::IntArrayRef dims, at::ArrayRef< T > values)
 Creates a CPU tensor, and fills its contents with the given values. More...
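A small sketch; the dims and values are illustrative only:

```cpp
#include "caffe2/core/tensor.h"

void MakeSmallTensor() {
  // 2x2 float CPU tensor filled row-major with the given values.
  caffe2::Tensor t =
      caffe2::TensorCPUFromValues<float>({2, 2}, {1.0f, 2.0f, 3.0f, 4.0f});
  const float* data = t.data<float>();  // t.numel() == 4
  (void)data;
}
```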
 
 CAFFE_KNOWN_TYPE (int8::Int8TensorCPU)
 
 C10_DEFINE_REGISTRY (TransformRegistry, Transform)
 
unique_ptr< Transform > CreateTransform (string key)
 
NetDef ApplyTransform (const string &key, const NetDef &netdef)
 
double average_net_run_duration (const NetDef &netdef, const NetDef &init_netdef, const int warmup_runs, const int main_runs)
 
NetDef ApplyTransformIfFaster (const string &key, const NetDef &netdef, const NetDef &init_netdef, const int warmup_runs, const int main_runs, const double improvement_threshold)
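A sketch of applying a registered graph transform; the transform key used here is hypothetical and must match a name actually registered in TransformRegistry:

```cpp
#include "caffe2/core/transform.h"

caffe2::NetDef MaybeOptimize(const caffe2::NetDef& predict_net,
                             const caffe2::NetDef& init_net) {
  // Unconditionally apply a transform registered under "MyTransform" (hypothetical key).
  caffe2::NetDef transformed = caffe2::ApplyTransform("MyTransform", predict_net);
  (void)transformed;

  // Or: keep the transformed net only if it benchmarks faster than the original.
  return caffe2::ApplyTransformIfFaster(
      "MyTransform", predict_net, init_net,
      /*warmup_runs=*/5, /*main_runs=*/10, /*improvement_threshold=*/1.01);
}
```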
 
 C10_DECLARE_REGISTRY (TransformRegistry, Transform)
 
TensorProto::DataType TypeMetaToDataType (const TypeMeta &meta)
 
const TypeMeta & DataTypeToTypeMeta (const TensorProto::DataType &dt)
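A round-trip sketch between TypeMeta and the TensorProto data-type enum:

```cpp
#include "caffe2/core/types.h"

void DtypeRoundTrip() {
  // TypeMeta -> proto enum.
  caffe2::TensorProto::DataType dt =
      caffe2::TypeMetaToDataType(caffe2::TypeMeta::Make<float>());
  // Proto enum -> TypeMeta.
  const caffe2::TypeMeta& meta = caffe2::DataTypeToTypeMeta(dt);
  (void)meta;  // meta.Match<float>() would be true here
}
```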
 
StorageOrder StringToStorageOrder (const string &str)
 
constexpr char NameScopeSeparator ()
 
template<typename T >
bool fp16_type ()
 
template<>
bool fp16_type< at::Half > ()
 
std::string GetUniqueName ()
 
 REGISTER_CPU_OPERATOR (CreateDB, CreateDBOp< CPUContext >)
 
 OPERATOR_SCHEMA (CreateDB).NumInputs(0).NumOutputs(1)
 
 NO_GRADIENT (CreateDB)
 
 REGISTER_CUDA_OPERATOR (CreateDB, CreateDBOp< CUDAContext >)
 
 REGISTER_CPU_OPERATOR (FileStoreHandlerCreate, FileStoreHandlerCreateOp< CPUContext >)
 
 NumInputs (0).NumOutputs(1).SetDoc(R"DOC( Creates a unique_ptr<StoreHandler> that uses the filesystem as a backing store (typically a filesystem shared between many nodes, such as NFS). This store handler is not built to be fast. Its recommended use is for integration tests and prototypes where extra dependencies are cumbersome. Use an ephemeral path to ensure multiple processes or runs don't interfere. )DOC").Arg("path","base path used by the FileStoreHandler").Arg("prefix","prefix for all keys used by this store").Output(0,"handler","unique_ptr<StoreHandler>")
 
 NO_GRADIENT (FileStoreHandlerCreateOp)
 
 REGISTER_CUDA_OPERATOR (FileStoreHandlerCreate, FileStoreHandlerCreateOp< CUDAContext >)
 
 REGISTER_CPU_OPERATOR (RedisStoreHandlerCreate, RedisStoreHandlerCreateOp< CPUContext >)
 
 Arg ("host","host name of Redis server").Arg("port","port number of Redis server").Arg("prefix","prefix for all keys used by this store")
 
 NO_GRADIENT (RedisStoreHandlerCreateOp)
 
 REGISTER_CUDA_OPERATOR (RedisStoreHandlerCreate, RedisStoreHandlerCreateOp< CUDAContext >)
 
 CAFFE_KNOWN_TYPE (std::unique_ptr< StoreHandler >)
 
 REGISTER_CPU_OPERATOR (StoreSet, StoreSetOp)
 
 NumInputs (2).NumOutputs(0).SetDoc(R"DOC( Set a blob in a store. The key is the input blob's name and the value is the data in that blob. The key can be overridden by specifying the 'blob_name' argument. )DOC").Arg("blob_name","alternative key for the blob (optional)").Input(0,"handler","unique_ptr<StoreHandler>").Input(1,"data","data blob")
 
 REGISTER_CPU_OPERATOR (StoreGet, StoreGetOp)
 
 NumInputs (1).NumOutputs(1).SetDoc(R"DOC( Get a blob from a store. The key is the output blob's name. The key can be overridden by specifying the 'blob_name' argument. )DOC").Arg("blob_name","alternative key for the blob (optional)").Input(0,"handler","unique_ptr<StoreHandler>").Output(0,"data","data blob")
 
 REGISTER_CPU_OPERATOR (StoreAdd, StoreAddOp)
 
 SetDoc(R"DOC( Add a value to a remote counter. If the key is not set, the store initializes it to 0 and then performs the add operation. The operation returns the resulting counter value. )DOC").Arg ("blob_name","key of the counter (required)").Arg("add_value","value that is added (optional, default: 1)").Input(0,"handler","unique_ptr<StoreHandler>").Output (0,"value","the current value of the counter")
 
 REGISTER_CPU_OPERATOR (StoreWait, StoreWaitOp)
 
 NumInputs (1, 2).NumOutputs(0).SetDoc(R"DOC( Wait for the specified blob names to be set. The blob names can be passed either as an input blob with blob names or as an argument. )DOC").Arg("blob_names","names of the blobs to wait for (optional)").Input(0,"handler","unique_ptr<StoreHandler>").Input(1,"names","names of the blobs to wait for (optional)")
 
 REGISTER_CPU_OPERATOR (FC_Decomp, FullyConnectedOpDecomp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (FCGradient_Decomp, FullyConnectedDecompGradientOp< float, CPUContext >)
 
 OPERATOR_SCHEMA (FC_Decomp).NumInputs(4).NumOutputs(1)
 
 OPERATOR_SCHEMA (FCGradient_Decomp).NumInputs(4).NumOutputs(3
 
 REGISTER_GRADIENT (FC_Decomp, GetFCDecompGradient)
 
 REGISTER_CUDA_OPERATOR (FC_Decomp, FullyConnectedOpDecomp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (FCGradient_Decomp, FullyConnectedDecompGradientOp< float, CUDAContext >)
 
 REGISTER_CPU_OPERATOR (TTContraction, TTContractionOp< float, CPUContext >)
 
 REGISTER_CUDA_OPERATOR (TTContraction, TTContractionOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (TTContractionGradient, TTContractionGradientOp< float, CUDAContext >)
 
void adam_ideep_update (int N, const float *g, const float *m, const float *v, float *ng, float *nm, float *nv, float beta1, float beta2, float eps_hat, float correction, const float *lr)
 
void adam_ideep_compute (int N, const float *w, const float *g, const float *m, const float *v, float *nw, float *nm, float *nv, float beta1, float beta2, float eps_hat, float correction, const float *lr)
 
void adam_ideep_compute_output_grad (int N, const float *w, const float *g, const float *m, const float *v, float *nw, float *nm, float *nv, float *ng, float beta1, float beta2, float eps_hat, float correction, const float *lr)
 
 REGISTER_IDEEP_OPERATOR (Adam, IDEEPAdamOp< float >)
 
 REGISTER_IDEEP_OPERATOR (ChannelShuffle, ChannelShuffleOp)
 
 REGISTER_IDEEP_OPERATOR (ChannelShuffleGradient, ChannelShuffleGradientOp)
 
 REGISTER_IDEEP_OPERATOR (Concat, IDEEPConcatOp)
 
 REGISTER_IDEEP_OPERATOR (Split, IDEEPSplitOp)
 
 REGISTER_IDEEP_OPERATOR (ConvFusion, IDEEPConvFusionOp)
 
std::function< void(OpSchema &)> ConvFusionDocGenerator (const char *dim)
 
 REGISTER_IDEEP_OPERATOR (Conv, IDEEPConvOp)
 
 REGISTER_IDEEP_OPERATOR (ConvGradient, IDEEPConvGradientOp)
 
 REGISTER_IDEEP_OPERATOR (ConvTranspose, IDEEPConvTransposeOp)
 
 REGISTER_IDEEP_OPERATOR (ConvTransposeGradient, IDEEPConvTransposeGradientOp)
 
 REGISTER_IDEEP_OPERATOR (Dropout, IDEEPDropoutOp)
 
 REGISTER_IDEEP_OPERATOR (DropoutGrad, IDEEPDropoutGradientOp)
 
 REGISTER_IDEEP_OPERATOR (Sum, IDEEPSumOp)
 
 REGISTER_IDEEP_OPERATOR (Add, IDEEPSumOp)
 
 REGISTER_IDEEP_OPERATOR (ExpandDims, IDEEPExpandDimsOp)
 
 REGISTER_IDEEP_OPERATOR (Squeeze, IDEEPSqueezeOp)
 
 USE_IDEEP_DEF_ALIASES ()
 
 REGISTER_IDEEP_OPERATOR (FC, IDEEPFullyConnectedOp)
 
 REGISTER_IDEEP_OPERATOR (FCGradient, IDEEPFullyConnectedGradientOp)
 
 REGISTER_IDEEP_OPERATOR (LRN, IDEEPLRNOp)
 
 REGISTER_IDEEP_OPERATOR (LRNGradient, IDEEPLRNGradientOp)
 
void momentum_sgd_update (const int N, const float *g, const float *m, float *ng, float *nm, const float *lr, const float momentum, const bool nesterov, float *param)
 
 REGISTER_IDEEP_OPERATOR (MomentumSGD, IDEEPMomentumSGDOp)
 
 REGISTER_IDEEP_OPERATOR (MomentumSGDUpdate, IDEEPMomentumSGDUpdateOp)
 
 REGISTER_IDEEP_COMPARE_OPERATOR (EQ)
 
 REGISTER_IDEEP_COMPARE_OPERATOR (GT)
 
 REGISTER_IDEEP_COMPARE_OPERATOR (GE)
 
 REGISTER_IDEEP_COMPARE_OPERATOR (LT)
 
 REGISTER_IDEEP_COMPARE_OPERATOR (LE)
 
 REGISTER_IDEEP_COMPARE_OPERATOR (NE)
 
 REGISTER_IDEEP_OPERATOR (Softmax, IDEEPFallbackOp< SoftmaxOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (LabelCrossEntropy, IDEEPFallbackOp< LabelCrossEntropyOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (AveragedLoss, IDEEPFallbackOp< AveragedLoss< float, CPUContext >, SkipIndices< 0 >>)
 
 REGISTER_IDEEP_OPERATOR (Flatten, IDEEPFallbackOp< FlattenOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (ResizeLike, IDEEPFallbackOp< ResizeLikeOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (Transpose, IDEEPFallbackOp< TransposeOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (Slice, IDEEPFallbackOp< SliceOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (Clip, IDEEPFallbackOp< ClipOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (ScatterAssign, IDEEPFallbackOp< ScatterAssignOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (Cast, IDEEPFallbackOp< CastOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (XavierFill, IDEEPFallbackOp< XavierFillOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (ConstantFill, IDEEPFallbackOp< ConstantFillOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (GaussianFill, IDEEPFallbackOp< GaussianFillOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (MSRAFill, IDEEPFallbackOp< MSRAFillOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (GivenTensorFill, IDEEPFallbackOp< GivenTensorFillOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (GivenTensorDoubleFill, IDEEPFallbackOp< GivenTensorFillOp< double, CPUContext >, SkipIndices< 0 >>)
 
 REGISTER_IDEEP_OPERATOR (GivenTensorBoolFill, IDEEPFallbackOp< GivenTensorFillOp< bool, CPUContext >, SkipIndices< 0 >>)
 
 REGISTER_IDEEP_OPERATOR (GivenTensorIntFill, IDEEPFallbackOp< GivenTensorFillOp< int, CPUContext >, SkipIndices< 0 >>)
 
 REGISTER_IDEEP_OPERATOR (GivenTensorInt64Fill, IDEEPFallbackOp< GivenTensorFillOp< int64_t, CPUContext >, SkipIndices< 0 >>)
 
 REGISTER_IDEEP_OPERATOR (GivenTensorStringFill, IDEEPFallbackOp< GivenTensorFillOp< std::string, CPUContext >, SkipIndices< 0 >>)
 
 REGISTER_IDEEP_OPERATOR (Load, IDEEPFallbackOp< LoadOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (Save, IDEEPFallbackOp< SaveOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (RMACRegions, IDEEPFallbackOp< RMACRegionsOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (RoIPool, IDEEPFallbackOp< RoIPoolOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (RoIAlign, IDEEPFallbackOp< RoIAlignOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (RoIAlignRotated, IDEEPFallbackOp< RoIAlignRotatedOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (GenerateProposals, IDEEPFallbackOp< GenerateProposalsOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (GenerateProposalsCPP, IDEEPFallbackOp< GenerateProposalsOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (CollectAndDistributeFpnRpnProposals, IDEEPFallbackOp< CollectAndDistributeFpnRpnProposalsOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (BoxWithNMSLimit, IDEEPFallbackOp< BoxWithNMSLimitOp< CPUContext >, SkipIndices< 0, 1, 2 >>)
 
 REGISTER_IDEEP_OPERATOR (BBoxTransform, IDEEPFallbackOp< BBoxTransformOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (AffineChannel, IDEEPFallbackOp< AffineChannelOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (StopGradient, IDEEPFallbackOp< StopGradientOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (PadImage, IDEEPFallbackOp< PadImageOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (PRelu, IDEEPFallbackOp< PReluOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (CTCGreedyDecoder, IDEEPFallbackOp< CTCGreedyDecoderOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (CTCBeamSearchDecoder, IDEEPFallbackOp< CTCBeamSearchDecoderOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (AveragedLossGradient, IDEEPFallbackOp< AveragedLossGradient< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (LabelCrossEntropyGradient, IDEEPFallbackOp< LabelCrossEntropyGradientOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (SoftmaxGradient, IDEEPFallbackOp< SoftmaxGradientOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (Iter, IDEEPFallbackOp< IterOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (LearningRate, IDEEPFallbackOp< LearningRateOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (Abs, IDEEPFallbackOp< UnaryElementwiseOp< TensorTypes< float >, CPUContext, AbsFunctor< CPUContext >>>)
 
 REGISTER_IDEEP_OPERATOR (Atan, IDEEPFallbackOp< UnaryElementwiseOp< TensorTypes< float >, CPUContext, AtanFunctor< CPUContext >>>)
 
 REGISTER_IDEEP_OPERATOR (Sqrt, IDEEPFallbackOp< UnaryElementwiseOp< TensorTypes< float >, CPUContext, SqrtFunctor< CPUContext >>>)
 
 REGISTER_IDEEP_OPERATOR (Div, IDEEPFallbackOp< BinaryElementwiseOp< NumericTypes, CPUContext, DivFunctor< CPUContext >>>)
 
 REGISTER_IDEEP_OPERATOR (Mul, IDEEPFallbackOp< BinaryElementwiseOp< NumericTypes, CPUContext, MulFunctor< CPUContext >>>)
 
 REGISTER_IDEEP_OPERATOR (Sub, IDEEPFallbackOp< BinaryElementwiseOp< NumericTypes, CPUContext, SubFunctor< CPUContext >>>)
 
 REGISTER_IDEEP_OPERATOR (Tanh, IDEEPFallbackOp< UnaryElementwiseOp< TensorTypes< float >, CPUContext, TanhFunctor< CPUContext >>>)
 
 REGISTER_IDEEP_OPERATOR (L1Distance, IDEEPFallbackOp< L1DistanceOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (Scale, IDEEPFallbackOp< ScaleOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (Accuracy, IDEEPFallbackOp< AccuracyOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (AddGradient, IDEEPFallbackOp< BinaryElementwiseGradientOp< NumericTypes, CPUContext, AddFunctor< CPUContext >>>)
 
 REGISTER_IDEEP_OPERATOR (TanhGradient, IDEEPFallbackOp< BinaryElementwiseOp< TensorTypes< float >, CPUContext, TanhGradientFunctor< CPUContext >>>)
 
 REGISTER_IDEEP_OPERATOR (MulGradient, IDEEPFallbackOp< BinaryElementwiseGradientOp< NumericTypes, CPUContext, MulFunctor< CPUContext >>>)
 
 REGISTER_IDEEP_OPERATOR (TensorProtosDBInput, IDEEPFallbackOp< TensorProtosDBInput< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (CloseBlobsQueue, IDEEPFallbackOp< CloseBlobsQueueOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (SoftmaxWithLoss, IDEEPFallbackOp< SoftmaxWithLossOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (SoftmaxWithLossGradient, IDEEPFallbackOp< SoftmaxWithLossGradientOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (NHWC2NCHW, IDEEPFallbackOp< NHWC2NCHWOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (NCHW2NHWC, IDEEPFallbackOp< NCHW2NHWCOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (Expand, IDEEPFallbackOp< ExpandOp< TensorTypes< std::int32_t, std::int64_t, float, double >, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (Gather, IDEEPFallbackOp< GatherOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (Normalize, IDEEPFallbackOp< NormalizeOp< float, CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (ReduceL2, IDEEPFallbackOp< ReduceOp< TensorTypes< float >, CPUContext, L2Reducer< CPUContext >>>)
 
 REGISTER_IDEEP_OPERATOR (ReduceSum, IDEEPFallbackOp< ReduceOp< TensorTypes< std::int32_t, std::int64_t, float, double >, CPUContext, SumReducer< CPUContext >>>)
 
 REGISTER_IDEEP_OPERATOR (ReduceMean, IDEEPFallbackOp< ReduceOp< TensorTypes< float >, CPUContext, MeanReducer< CPUContext >>>)
 
 REGISTER_IDEEP_OPERATOR (BatchMatMul, IDEEPFallbackOp< BatchMatMulOp< CPUContext >>)
 
 REGISTER_IDEEP_OPERATOR (MaxPool, IDEEPPoolOp)
 
 REGISTER_IDEEP_OPERATOR (MaxPoolGradient, IDEEPPoolGradientOp)
 
 REGISTER_IDEEP_OPERATOR (AveragePool, IDEEPPoolOp)
 
 REGISTER_IDEEP_OPERATOR (AveragePoolGradient, IDEEPPoolGradientOp)
 
 REGISTER_IDEEP_OPERATOR (CreateBlobsQueue, IDEEPCreateBlobsQueueOp)
 
 SHOULD_NOT_DO_GRADIENT (IDEEPCreateBlobsQueueOp)
 
 REGISTER_IDEEP_OPERATOR (SafeEnqueueBlobs, IDEEPSafeEnqueueBlobsOp)
 
 SHOULD_NOT_DO_GRADIENT (IDEEPSafeEnqueueBlobsOp)
 
 REGISTER_IDEEP_OPERATOR (Relu, IDEEPReluOp)
 
 REGISTER_IDEEP_OPERATOR (ReluGradient, IDEEPReluGradientOp)
 
 REGISTER_IDEEP_OPERATOR (LeakyRelu, IDEEPReluOp)
 
 REGISTER_IDEEP_OPERATOR (LeakyReluGradient, IDEEPReluGradientOp)
 
 REGISTER_IDEEP_OPERATOR (Reshape, IDEEPReshapeOp)
 
 REGISTER_IDEEP_OPERATOR (Shape, IDEEPShapeOp)
 
 REGISTER_IDEEP_OPERATOR (Sigmoid, IDEEPSigmoidOp)
 
 REGISTER_IDEEP_OPERATOR (SigmoidGradient, IDEEPSigmoidGradientOp)
 
 REGISTER_IDEEP_OPERATOR (SpatialBN, IDEEPSpatialBNOp)
 
 REGISTER_IDEEP_OPERATOR (CopyCPUToIDEEP, CopyCPUToIDEEPOp)
 
 REGISTER_IDEEP_OPERATOR (CopyIDEEPToCPU, CopyIDEEPToCPUOp)
 
 REGISTER_IDEEP_OPERATOR (Copy, IDEEPCopyOp)
 
 REGISTER_IDEEP_OPERATOR (WeightedSum, IDEEPWeightedSumOp)
 
 Input(0,"cpu_blob","The input TensorCPU to copy").Output (0,"ideep_blob","The output IDEEP tensor to copy to")
 
 Input(0,"ideep_blob","The input IDEEP tensor to copy").Output (0,"cpu_blob","The output TensorCPU to copy to")
 
 C10_DECLARE_REGISTRY (IDEEPOperatorRegistry, OperatorBase, const OperatorDef &, Workspace *)
 
 CAFFE_KNOWN_TYPE (ideep::tensor)
 
 C10_DEFINE_REGISTRY (IDEEPOperatorRegistry, OperatorBase, const OperatorDef &, Workspace *)
 
 CAFFE_REGISTER_DEVICE_TYPE (DeviceType::IDEEP, IDEEPOperatorRegistry)
 
 REGISTER_EVENT_CREATE_FUNCTION (IDEEP, EventCreateCPU)
 
 REGISTER_EVENT_RECORD_FUNCTION (IDEEP, EventRecordCPU)
 
 REGISTER_EVENT_WAIT_FUNCTION (IDEEP, IDEEP, EventWaitCPUCPU)
 
 REGISTER_EVENT_WAIT_FUNCTION (IDEEP, CPU, EventWaitCPUCPU)
 
 REGISTER_EVENT_WAIT_FUNCTION (CPU, IDEEP, EventWaitCPUCPU)
 
 REGISTER_EVENT_FINISH_FUNCTION (IDEEP, EventFinishCPU)
 
 REGISTER_EVENT_QUERY_FUNCTION (IDEEP, EventQueryCPU)
 
 REGISTER_EVENT_ERROR_MESSAGE_FUNCTION (IDEEP, EventErrorMessageCPU)
 
 REGISTER_EVENT_SET_FINISHED_FUNCTION (IDEEP, EventSetFinishedCPU)
 
 REGISTER_EVENT_RESET_FUNCTION (IDEEP, EventResetCPU)
 
 REGISTER_CPU_OPERATOR (ImageInput, ImageInputOp< CPUContext >)
 
 NumInputs (0, 1).NumOutputs(2, INT_MAX).TensorInferenceFunction ([](const OperatorDef &def, const vector< TensorShape > &){vector< TensorShape > out(2);ArgumentHelper helper(def);int batch_size=helper.GetSingleArgument< int >("batch_size", 0);int crop=helper.GetSingleArgument< int >("crop",-1);int color=helper.GetSingleArgument< int >("color", 1);CHECK_GT(crop, 0);out[0]=CreateTensorShape(vector< int >{batch_size, crop, crop, color?3:1}, TensorProto::FLOAT);out[1]=CreateTensorShape(vector< int >{1, batch_size}, TensorProto::INT32);return out;}).SetDoc(R"DOC( Imports and processes images from a database. For each run of the operator, batch_size images will be processed. GPUs can optionally be used for part of the processing. The following transformations are applied to the image: a bounding box is applied to the initial image (optional); the image is rescaled either up or down (with the scale argument) or just up (with the minsize argument); the image is randomly cropped (crop size is passed as an argument but the location of the crop is random, except if is_test is passed, in which case the image is cropped at the center); the image is normalized, and each of its color channels can have separate normalization values. The dimension of the output image will always be crop x crop. )DOC").Arg("batch_size","Number of images to output for each run of the operator. Must be 1 or greater").Arg ("color","Number of color channels (1 or 3). Defaults to 1").Arg("color_jitter","Whether or not to do color jitter").Arg ("img_saturation","Image saturation scale used in color jittering. Defaults to 0.4").Arg("img_brightness","Image brightness scale used in color jittering").Arg ("img_contrast","Image contrast scale used in color jittering. Defaults to 0.4").Arg("color_lighting","Whether or not to do color lighting").Arg ("color_lighting_std","Std of normal distribution where color lighting scaling factor is sampled. Defaults to 0.1").Arg("scale_jitter_type","Type of scale jitter to apply").Arg("scale","Scale the size of the smallest dimension of the image to this. Scale and minsize are mutually exclusive. Must be larger than crop").Arg ("minsize","Scale the size of the smallest dimension of the image to this only if the size is initially smaller. Scale and minsize are mutually exclusive. Must be larger than crop.").Arg("warp","If set, both dimensions are scaled to the target size; otherwise the other dimension is proportionally scaled").Arg ("crop","Size to crop the image to. Must be provided").Arg("mirror","Whether or not to mirror the image").Arg ("mean","Mean by which to normalize color channels. Defaults to 0.").Arg("mean_per_channel","Vector of means per color channel (1 or 3 elements). Defaults to mean argument. Channel order BGR").Arg("std","Standard deviation by which to normalize color channels").Arg ("std_per_channel","Vector of standard dev. per color channel (1 or 3 elements). Defaults to std argument. Channel order is BGR").Arg("bounding_ymin","Bounding box coordinate. Defaults to (none)").Arg("bounding_xmin","Bounding box coordinate. Defaults to (none)").Arg("bounding_height","Bounding box coordinate. Defaults to (none)").Arg("bounding_width","Bounding box coordinate. Defaults to (none)").Arg("use_caffe_datum","1 if the input is in Caffe format. Defaults to 0").Arg ("use_gpu_transform","1 if GPU acceleration should be used. Defaults to 0. Can only be 1 in a CUDAContext").Arg("decode_threads","Number of CPU decode/transform threads").Arg ("output_type","If gpu_transform, can set to FLOAT or FLOAT16.").Arg("db","Name of the database (if not passed as input)").Arg("db_type","Type of database").Arg("output_sizes","The sizes of any outputs besides the data and label (should have a number of elements equal to the number of additional outputs)").Arg("random_scale","Shortest side desired for image resize. Defaults to [-1, -1] or no random resize desired").Input (0,"reader","The input reader (a db::DBReader)").Output(0,"data","Tensor containing the images").Output (1,"label","Tensor containing the labels")
 
 NO_GRADIENT (ImageInput)
 
template<class Context >
bool RandomSizedCropping (cv::Mat *img, const int crop, std::mt19937 *randgen)
 
template<class Context >
void Saturation (float *img, const int img_size, const float alpha_rand, std::mt19937 *randgen)
 
template<class Context >
void Brightness (float *img, const int img_size, const float alpha_rand, std::mt19937 *randgen)
 
template<class Context >
void Contrast (float *img, const int img_size, const float alpha_rand, std::mt19937 *randgen)
 
template<class Context >
void ColorJitter (float *img, const int img_size, const float saturation, const float brightness, const float contrast, std::mt19937 *randgen)
 
template<class Context >
void ColorLighting (float *img, const int img_size, const float alpha_std, const std::vector< std::vector< float >> &eigvecs, const std::vector< float > &eigvals, std::mt19937 *randgen)
 
template<class Context >
void ColorNormalization (float *img, const int img_size, const int channels, const std::vector< float > &mean, const std::vector< float > &std)
 
template<class Context >
void TransformImage (const cv::Mat &scaled_img, const int channels, float *image_data, const bool color_jitter, const float saturation, const float brightness, const float contrast, const bool color_lighting, const float color_lighting_std, const std::vector< std::vector< float >> &color_lighting_eigvecs, const std::vector< float > &color_lighting_eigvals, const int crop, const bool mirror, const std::vector< float > &mean, const std::vector< float > &std, std::mt19937 *randgen, std::bernoulli_distribution *mirror_this_image, bool is_test=false)
 
template<class Context >
void CropTransposeImage (const cv::Mat &scaled_img, const int channels, uint8_t *cropped_data, const int crop, const bool mirror, std::mt19937 *randgen, std::bernoulli_distribution *mirror_this_image, bool is_test=false)
 
 REGISTER_CUDA_OPERATOR (ImageInput, ImageInputOp< CUDAContext >)
 
template<typename T_IN , typename T_OUT , class Context >
bool TransformOnGPU (Tensor &X, Tensor *Y, Tensor &mean, Tensor &std, Context *context)
 
bool tryConvertToMPSCNN (const NetDef &initNet, const NetDef &predictNet, NetDef *mpscnnPredictNet)
 
NetDef annotateDefWithReadCounts (const NetDef &net)
 
NetDef rewriteForMetal (const NetDef &net)
 
NetDef runMPSCNNFusion (const NetDef &net)
 
void dumpDef (const NetDef &d)
 
void mpscnnRecordExecutionFinish ()
 
MPSCNNContext & getMPSCNNContext ()
 
bool tryConvertToMPSCNNIntermediateCopies (const NetDef &initNet, const NetDef &predictNet, NetDef *mpscnnPredictNet)
 
NetDef setSpecialArgs (const NetDef &def)
 
void testMPSCNN ()
 
void compareModels (const NetDef &initNet, NetDef predictNet)
 
void verifyRewrite (const NetDef &initNet, const NetDef &net, std::vector< int > inputDims)
 
std::string & gSNPELocation ()
 
 REGISTER_CPU_OPERATOR (SNPE, SNPEOp)
 
void uniformQuantize2b1b (const TensorCPU &X, const std::vector< std::unique_ptr< TensorCPU >> &XQ, float offset, float inter_center_distance)
 
void qconv (const ConvArgs &args, const TensorCPU &X, const TensorCPU &W, const TensorCPU *b, TensorCPU *Y)
 
void qpad_zero (const ConvArgs &args, const TensorCPU &X, TensorCPU *Y)
 
void signQuantize (const TensorCPU &X, TensorCPU *XQ)
 
void filterNormalization11 (const TensorCPU &WQ, TensorCPU *WQN)
 
void filterNormalizationL1 (const TensorCPU &W, TensorCPU *WL1)
 
void qim2col (const ConvArgs &args, const TensorCPU &XQ, const TensorCPU &WQ, TensorCPU *XQcol)
 
std::unique_ptr< QConvState > create2b1bConvState (Workspace *ws, const TensorCPU &W, const TensorCPU *b)
 
void run2b1bConvGeneric (QConvState *state, const ConvArgs &args, const TensorCPU &X, TensorCPU *Y)
 
void run2b1bUnification (QConvState *state, size_t N, size_t C, const float *WQNVdata, const float *YQs0Vdata, const float *YQs1Vdata, size_t YQstride, float *Ydata, size_t Ystride, const float *bias)
 
 REGISTER_CPU_OPERATOR (QConv, QConvOp)
 
size_t divRoundUp (size_t x, size_t d)
 
bool run2b1bConvNeon (QConvState *state, const ConvArgs &args, const TensorCPU &X, TensorCPU *Y)
 
 CAFFE_KNOWN_TYPE (MPICommonWorldWrapper)
 
std::mutex & MPIMutex ()
 
MPI_Comm GlobalMPIComm ()
 Gets the global MPI communicator used by Caffe2. More...
 
void SetGlobalMPIComm (MPI_Comm new_comm)
 Sets the global MPI communicator. More...
 
int MPICommSize (MPI_Comm comm)
 A helper function to return the size of the given communicator.
 
int MPICommRank (MPI_Comm comm)
 A helper function to return the rank of the given communicator.
 
void MPISetupPeers (const int replicas, const string &role, const string &job_path)
 A function used to perform peer setup so one does not need to use mpirun / mpiexec to run the binary. More...
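A sketch of querying the global communicator; this assumes MPI has already been initialized (for example via mpirun/mpiexec or MPISetupPeers):

```cpp
#include <cstdio>
#include <mpi.h>
#include "caffe2/mpi/mpi_common.h"

void ReportRank() {
  // The communicator that Caffe2 MPI operators use by default.
  MPI_Comm comm = caffe2::GlobalMPIComm();
  const int rank = caffe2::MPICommRank(comm);
  const int size = caffe2::MPICommSize(comm);
  std::printf("node %d of %d\n", rank, size);
}
```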
 
void CheckInitializedMPI ()
 
 REGISTER_CPU_OPERATOR (Abs, UnaryElementwiseOp< TensorTypes< float >, CPUContext, AbsFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (AbsGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, AbsGradientFunctor< CPUContext >>)
 
 OPERATOR_SCHEMA (AbsGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape()
 
 REGISTER_GRADIENT (Abs, GetAbsGradient)
 
 REGISTER_CPU_OPERATOR (Accumulate, AccumulateOp< float, CPUContext >)
 
 SetDoc(R"DOC( We first initialize the output tensor to all zeros, and then do accumulation. Any further calls to the operator, given that no one else fiddles with the output in the interim, will do simple accumulations. Accumulation is done using the Axpby operation as shown: Y = 1*X + gamma*Y, where X is the input tensor, Y is the output tensor and gamma is the multiplier argument. )DOC").Arg ("gamma","(float, default 1.0) Accumulation multiplier").Input(0,"input","The input tensor that has to be accumulated to the output tensor. If the output size is not the same as the input size, the output tensor is first reshaped and initialized to zero, and only then is accumulation done.").Output (0,"output","Accumulated output tensor")
 
 SHOULD_NOT_DO_GRADIENT (Accumulate)
 
 REGISTER_CPU_OPERATOR (Accuracy, AccuracyOp< float, CPUContext >)
 
 NumInputs(2).NumOutputs(1).ScalarType(TensorProto::FLOAT)
 
 SHOULD_NOT_DO_GRADIENT (Accuracy)
 
 REGISTER_CPU_OPERATOR (Acos, UnaryElementwiseOp< TensorTypes< float >, CPUContext, AcosFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (AcosGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, AcosGradientFunctor< CPUContext >>)
 
 SetDoc(R"DOC( Calculates the arccosine of the given input tensor, element-wise. )DOC").Input (0,"input","Input tensor").Output(0,"output","The arccosine of the input tensor computed element-wise")
 
 REGISTER_GRADIENT (Acos, GetAcosGradient)
 
 REGISTER_CPU_OPERATOR (AdjustBatch, AdjustBatchOp< CPUContext >)
 
 Input (0,"Input","Input data").Input(1
 
Real batch size Output (0,"Output","Data with Adjusted batch size").Output(1
 
Real batch size Real batah size Arg ("max_batch_size","(*int*): max batch size").SetDoc(R"DOC( Adjust the batch size of `input` tensor. When we only have 1 input
 
 REGISTER_CPU_OPERATOR (AffineChannel, AffineChannelOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (AffineChannelGradient, AffineChannelGradientOp< float, CPUContext >)
 
 NumInputs (3).NumOutputs(1).AllowInplace({{0, 0}}).SetDoc (R"DOC( Applies a separate affine transformation to each channel of the input. Useful for replacing spatial batch norm with its equivalent fixed transformation. )DOC").Input(0,"X","Feature map input with order NCHW or NHWC.").Input (1,"scale","1D input of shape (C); the c-th element is the scale factor of the affine transformation for the c-th channel of the input.").Input(2,"bias","1D input of shape (C); the c-th element is the bias of the affine transformation for the c-th channel of the input.").Output (0,"Y","Output with the same order of Input.")
 
 NumInputs ({2, 3}).NumOutputs(
 
 AllowInplace ({{0, 0}})
 
 REGISTER_GRADIENT (AffineChannel, GetAffineChannelGradient)
 
 REGISTER_CPU_OPERATOR (ArgMax, ArgOp< CPUContext, ArgMaxReducer< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (ArgMin, ArgOp< CPUContext, ArgMinReducer< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (Asin, UnaryElementwiseOp< TensorTypes< float >, CPUContext, AsinFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (AsinGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, AsinGradientFunctor< CPUContext >>)
 
 REGISTER_GRADIENT (Asin, GetAsinGradient)
 
 REGISTER_CPU_OPERATOR (Assert, AssertOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (Atan, UnaryElementwiseOp< TensorTypes< float >, CPUContext, AtanFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (AtanGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, AtanGradientFunctor< CPUContext >>)
 
 REGISTER_GRADIENT (Atan, GetAtanGradient)
 
 REGISTER_CPU_OPERATOR (BatchBucketize, BatchBucketizeOp< CPUContext >)
 
 NumInputs (4).NumOutputs(1).SetDoc(R"DOC( Bucketize the float_features into sparse features. The float_features is a N * D tensor where N is the batch_size and D is the feature_dim. The indices is a tensor containing the indices of the features that need to be bucketized. The lengths is a tensor that splits the following boundaries argument. The boundaries is a tensor containing the border list for each feature. Within each indices there should be no duplicates, and the number of elements in indices should be less than or equal to D. Each element in the lengths vector (lengths[`i`]) represents the number of boundaries in the sub border list. The sum of all elements in `lengths` must be equal to the size of `boundaries`. If lengths[0] )DOC")
 
 REGISTER_CPU_OPERATOR (BatchGather, BatchGatherOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (BatchGatherGradient, BatchGatherGradientOp< CPUContext >)
 
 SetDoc (R"DOC( Batch gather operation, first dimension in DATA is the batch size. Given DATA tensor of rank r >= 2, and INDICES tensor of rank q >= 1, gather entries of the second outer dimension (axis == 1) of DATA indexed by INDICES, and concatenate them in an output tensor of rank q + (r - 1). Example: DATA = [ [1.0, 1.2, 2.4, 4.5], [2.3, 3.4, 3.6, 2.3], [4.5, 5.7, 1.2, 4.5], ] INDICES = [0, 2] OUTPUT = [ [1.0, 2.4], [2.3, 3.6], [4.5, 1.2], ] )DOC").Input(0
 
Tensor of rank of any rank q Output (0,"OUTPUT","Tensor of rank q + (r - 1).").InheritOnnxSchema()
 
 OPERATOR_SCHEMA (BatchGatherGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_GRADIENT (BatchGather, GetBatchGatherGradient)
 
 REGISTER_CPU_OPERATOR (BatchMatMul, BatchMatMulOp< CPUContext >)
 
vector< TensorShape > TensorInferenceForBatchMatMul (const OperatorDef &def, const vector< TensorShape > &in)
 
OpSchema::Cost CostInferenceForBatchMatMul (const OperatorDef &def, const vector< TensorShape > &in)
 
 SetDoc(R"DOC( Batch matrix multiplication Yi = Ai * Bi, where A has shape (dim0, dim1, ... M, K) and B has shape (dim0, dim1, ... K, N). )DOC")
 
 REGISTER_CPU_OPERATOR (BatchMoments, BatchMomentsOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (BatchMomentsGradient, BatchMomentsGradientOp< float, CPUContext >)
 
 OPERATOR_SCHEMA (BatchMoments).NumInputs(1).NumOutputs(2)
 
 OPERATOR_SCHEMA (BatchMomentsGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_GRADIENT (BatchMoments, GetBatchMomentsGradient)
 
 REGISTER_CPU_OPERATOR (BatchSparseToDense, BatchSparseToDenseOp< float, CPUContext >)
 
 NumInputs (3, 4).NumOutputs(1).DisallowInputFillers().SetDoc(R"DOC( Convert sparse matrix representation into dense matrix. A sparse matrix is represented by `lengths` vector
 
 REGISTER_CPU_OPERATOR (BisectPercentile, BisectPercentileOp< CPUContext >)
 
 with the size of (batch_size, num_feature), where we also need additional information regarding the feature value distribution. There are several vectors to keep data-to-percentile mapping information as arguments (context): the interpolation is applied by (R[t], R[t+1]) and (U[t] and L[t]). As there are F features (F >
 
 REGISTER_CPU_OPERATOR (BooleanMask, BooleanMaskOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (BooleanMaskLengths, BooleanMaskLengthsOp< CPUContext >)
 
 SetDoc (R"DOC( Given a 1D `data` tensor and a boolean `mask` tensor of the same shape, returns a `masked_data` tensor containing only the elements corresponding to positions where the `mask` is True, and a `masked_indices` tensor containing the indices of the True elements. Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/boolean_mask_ops.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "BooleanMask", ["data", "mask"], ["masked_data", "masked_indices"] ) workspace.FeedBlob("data", np.array([1,2,3,4,5,6])) workspace.FeedBlob("mask", np.array([True,False,False,True,True,False])) print("data:", workspace.FetchBlob("data")) print("mask:", workspace.FetchBlob("mask")) workspace.RunOperatorOnce(op) print("masked_data:", workspace.FetchBlob("masked_data")) print("masked_indices:", workspace.FetchBlob("masked_indices")) ``` **Result** ``` data: [1 2 3 4 5 6] mask: [ True False False True True False] masked_data: [1 4 5] masked_indices: [0 3 4] ``` </details> )DOC").Input(0
 
 ,"data","(*Tensor*): 1D input tensor").Input(1,"mask","(*Tensor`<bool>`*): boolean tensor of the same shape as `data` that marks the elements to keep").Output (0,"masked_data","(*Tensor*): 1D tensor of same type as `data` input that contains the masked input tensor").Output(1,"masked_indices","(*Tensor`<int>`*): 1D tensor of indices of the True elements")
 
 SetDoc(R"DOC( Given a `lengths` tensor representing segment lengths and a boolean `mask` tensor, return the segment lengths of the corresponding segmented tensor after **BooleanMask** is applied. If the `lengths` tensor is $[a_1, a_2, ..., a_n]$, then the length of the `mask` tensor must be $a_1 + a_2 + ... + a_n$. )DOC")
 
 NO_GRADIENT (BooleanMaskLengths)
 
template<typename Functor >
void MaskWithFunctor (size_t N, size_t M, int B, const float *in, Functor fn, float fill_val, float *out)
 
template<typename Functor >
void RepeatedMaskWithFunctor (size_t N, size_t M, int D, const float *in, Functor fn, float fill_val, float *out)
 
 REGISTER_CPU_OPERATOR (SequenceMask, SequenceMaskOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (BooleanUnmask, BooleanUnmaskOp< CPUContext >)
 
 NumInputs ([](int n){return n > 0 &&n%2==0;}).NumOutputs(1).SetDoc(R"DOC( Given a series of masks and values, reconstruct values together according to masks. Note that for every position there must be at least one True across the masks; if multiple masks are True at the same position, we accept the first value and no longer expect a value for that position. Note that we alternate data and mask inputs. )DOC")
 
 REGISTER_CPU_OPERATOR (ByteWeightDequant, ByteWeightDequantOp< CPUContext >)
 
 OPERATOR_SCHEMA (ByteWeightDequant).NumInputs(1).NumOutputs(1)
 
 REGISTER_CPU_OPERATOR (Cast, CastOp< CPUContext >)
 
 out.push_back(in[0]); out[0].set_data_type(cast::GetCastDataType(helper, "to"))
 
 SetDoc (R"DOC( Casts the elements of a given input tensor to a data type specified by the `to` argument and returns an output tensor of the same size in the converted type. The `to` argument must be one of the data types specified in the *DataType* enum field in the TensorProto message (see below). If the `to` argument is not provided or is not one of the enumerated types in *DataType*, Caffe2 throws an Enforce error. NOTE: Casting from strings is not supported, and casting to strings is only supported on CPU. TensorProto *DataType* field: ``` message TensorProto { ... enum DataType { UNDEFINED = 0; FLOAT = 1; // float INT32 = 2; // int BYTE = 3; // BYTE, when deserialized, is going to be restored as uint8. STRING = 4; // string BOOL = 5; // bool UINT8 = 6; // uint8_t INT8 = 7; // int8_t UINT16 = 8; // uint16_t INT16 = 9; // int16_t INT64 = 10; // int64_t FLOAT16 = 12; // at::Half DOUBLE = 13; // double } ``` Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/cast_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "Cast", ["X"], ["Y"], to=2 ) workspace.FeedBlob("X", (np.random.rand(3,3)).astype(np.float32)*10) print("X:", workspace.FetchBlob("X")) workspace.RunOperatorOnce(op) print("Y:", workspace.FetchBlob("Y")) ``` **Result** ``` X: [[9.436466 5.8529844 0.54932857] [1.1583444 2.9936118 0.22950427] [3.9143739 3.4040766 8.905341 ]] Y: [[9 5 0] [1 2 0] [3 3 8]] ``` </details> )DOC").Arg("to"
 
 REGISTER_GRADIENT (Cast, GetCastGradient)
 
 REGISTER_CPU_OPERATOR (Cbrt, UnaryElementwiseOp< TensorTypes< float >, CPUContext, CbrtFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (CbrtGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, CbrtGradientFunctor< CPUContext >>)
 
 IdenticalTypeAndShape ().Input(0
 
 REGISTER_GRADIENT (Cbrt, GetCbrtGradient)
 
 REGISTER_CPU_OPERATOR (Ceil, CeilOp< float, CPUContext >)
 
 SetDoc (R"DOC( Element-wise application of the ceil function ($y=ceil(x)$) to the input tensor `X`. Output tensor shape is the same as the input tensor. Github Link: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/ceil_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "Ceil", ["X"], ["X"], ) workspace.FeedBlob("X", (np.random.uniform(-10, 10, (5,5))).astype(np.float32)) print("X before running op:", workspace.FetchBlob("X")) workspace.RunOperatorOnce(op) print("X after running op:", workspace.FetchBlob("X")) ``` **Result** ``` X before running op: [[ 8.44598 -6.5098248 -2.2993476 -7.6859694 0.58566964] [-7.846551 -0.03689406 6.9362907 -4.0521703 4.4969673 ] [ 0.33355865 -7.895527 -8.393201 9.374202 -2.3930092 ] [-6.3061996 3.1403487 3.782099 -8.516556 -2.8387244 ] [-2.0164998 4.7663913 -3.422966 0.3636999 8.75713 ]] X after running op: [[ 9. -6. -2. -7. 1.] [-7. -0. 7. -4. 5.] [ 1. -7. -8. 10. -2.] [-6. 4. 4. -8. -2.] [-2. 5. -3. 1. 9.]] ``` </details> )DOC").Input(0
 
 GRADIENT_NOT_IMPLEMENTED_YET (Ceil)
 
 REGISTER_CPU_OPERATOR (ChannelBackpropStats, ChannelBackpropStatsOp< CPUContext >)
 
 SetDoc(R"DOC( Given the gradient for the output of SpatialBN and the per-channel mean and inverse std var vectors for the input, computes the per-channel bias and scale gradient to be used during the backward pass for subsequent spatial batch normalization gradient calculation. Typically, the results of this op are subsequently reduced over multiple devices to obtain statistics over a larger batch size in cases where the batch size for a single model copy is too low to yield the full benefit of batch normalization. The resulting bias and scale can then be plugged back into SpatialBNGradient to get results over the larger batch size. )DOC").Input (0,"X","The input 4-dimensional tensor of shape NCHW").Input(1,"mean","The mean saved from the forward pass as a 1-dimensional tensor of size C.").Input (2,"inv_std","The saved inverse standard deviation as a 1-dimensional tensor of size C.").Input(3,"output_grad","Gradient for the output layer of SpatialBN, here used as input because we are on the backward pass").Output (0,"scale_grad","Gradient for the scale vector").Output(1,"bias_grad","Gradient for the bias vector")
 
 SHOULD_NOT_DO_GRADIENT (ChannelBackpropStats)
 
 REGISTER_CPU_OPERATOR (ChannelShuffle, ChannelShuffleOp< float, CPUContext >)
 
 REGISTER_CPU_GRADIENT_OPERATOR (ChannelShuffleGradient, ChannelShuffleGradientOp< float, CPUContext >)
 
 REGISTER_GRADIENT (ChannelShuffle, GetChannelShuffleGradient)
 
 REGISTER_CPU_OPERATOR (ChannelStats, ChannelStatsOp< CPUContext >)
 
 SetDoc(R"DOC( Computes the sum of all elements per channel and the sum of all elements squared per channel. These values can be reduced across multiple batches and used to obtain the mean and variance across the full set of batches. Using the new mean and variance as input to SpatialBN has the effect of changing the batch size over which SpatialBN is applied. )DOC").Output(0,"sum","The output 1-dimensional tensor of size C containing the sum of elements of X per channel").Output (1,"sumsq","The output 1-dimensional tensor of size C containing the sum of elements squared per channel.")
 
 SHOULD_NOT_DO_GRADIENT (ChannelStats)
 
 REGISTER_CPU_OPERATOR (Clip, ClipOp< float, CPUContext >)
 
 REGISTER_CPU_GRADIENT_OPERATOR (ClipGradient, ClipGradientOp< float, CPUContext >)
 
 Input(0,"kv_handler","Key value handler for rendezvous (optional).").Output(0,"comm_world","A common world for collective operations.").Arg ("size","(int) size of the common world.").Arg("rank","(optional) rank of this node in the common world.")
 
 Input(0,"existing_comm_world","Existing common world to clone.").Output (0,"comm_world","A common world for collective operations.")
 
 SetDoc ("Closes all connections managed by a common world.").Input(0
 
 NumInputsOutputs ([](int in, int out){return in >=2 &&out==(in-1);}).EnforceInplace([](int in
 
 InputsCanCrossDevices ().IdenticalTypeAndShapeOfInput(0).SetDoc(R"DOC( Does a broadcast operation from the root node to every other node. The tensor on each node should have been pre-created with the same shape and data type. )DOC").Input(0,"comm_world","The common world.").Input (1,"X","A tensor to be broadcasted.").Output(0,"X","In-place as input 1.").Arg ("root","(int, default 0) the root to run broadcast from.")
 
The common world Input (1,"X","A tensor to be reduced.").Output(0
 
The common world The reduced result on not set for other nodes Arg ("root","(int, default 0) the root to run reduce into.")
 
 IdenticalTypeAndShapeOfInput (0).InputsCanCrossDevices().SetDoc(R"DOC( Does an allreduce operation among the nodes. Currently only Sum is supported. )DOC").Input(0,"comm_world","The common world.").Input (1,"X","A tensor to be allreduced.").Output(0,"Y","The allreduced tensor, same on all nodes.")
 
The common world Input (1,"X","A tensor to be reduce-scattered.").Output(0
 
 NumInputs (2, INT_MAX).NumOutputs(1).InputsCanCrossDevices().SetDoc(R"DOC( Does an allgather operation among the nodes. )DOC").Input(0,"comm_world","The common world.").Input (1,"X","A tensor to be allgathered.").Output(0,"Y","The allgathered tensor, same on all nodes.")
 
 NumInputs ({2, 4}).NumOutputs(0).SetDoc(R"DOC( Sends the tensor to another node. )DOC").Input(0,"comm_world","The common world.").Input(1,"X","A tensor to be sent.").Input(2,"dst","An int CPUtensor of size 1 specifying the rank. If given, this overrides the 'to' argument of the op.").Input (3,"tag","An int CPUtensor of size 1 specifying the tag to send the tensor with. This overrides the 'tag' argument of the op.").Arg("dst","The rank to send the tensor to.").Arg ("tag","(int) a tag to send the tensor with.").Arg("raw_buffer","(bool) if set, only send the content and assume that the receiver has already known the tensor's shape and information.")
 
 AllowInplace ({{2, 1},{3, 2}}).SetDoc(R"DOC( Receives the tensor from another node. )DOC").Input(0,"comm_world","The common world.").Input (1,"Y","In-place output. If raw_buffer is specified, Y should have pre-allocated data and type.").Input(2,"src","An int CPUtensor of size 1 specifying the rank. If given, this overrides the 'from' argument of the op.").Input(3,"tag","An int CPUtensor of size 1 specifying the tag to receive the tensor with. This overrides the 'tag' argument of the op.").Output(0,"Y","The received tensor.").Output (1,"src","The sender that sent the message as a CPUTensor of size 1 and of type int.").Output(2,"tag","The tag that the message is sent with as a CPUTensor of size 1 and of type int.").Arg ("src","(int) the rank to receive the tensor from.").Arg("tag","(int) a tag to receive the tensor with.").Arg ("raw_buffer","(bool) if set, only send the content and assume that the receiver has already known the tensor's shape and information.")
 
 SHOULD_NOT_DO_GRADIENT (CreateCommonWorld)
 
 SHOULD_NOT_DO_GRADIENT (CloneCommonWorld)
 
 SHOULD_NOT_DO_GRADIENT (DestroyCommonWorld)
 
 SHOULD_NOT_DO_GRADIENT (Broadcast)
 
 SHOULD_NOT_DO_GRADIENT (Reduce)
 
 SHOULD_NOT_DO_GRADIENT (Allgather)
 
 SHOULD_NOT_DO_GRADIENT (Allreduce)
 
 SHOULD_NOT_DO_GRADIENT (ReduceScatter)
 
 SHOULD_NOT_DO_GRADIENT (Barrier)
 
 SHOULD_NOT_DO_GRADIENT (SendTensor)
 
 SHOULD_NOT_DO_GRADIENT (ReceiveTensor)
 
 REGISTER_CPU_OPERATOR (CreateCommonWorld, NoDefaultEngineOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (CloneCommonWorld, NoDefaultEngineOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (DestroyCommonWorld, NoDefaultEngineOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (Broadcast, NoDefaultEngineOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (Reduce, NoDefaultEngineOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (Allgather, NoDefaultEngineOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (Allreduce, NoDefaultEngineOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (ReduceScatter, NoDefaultEngineOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (Barrier, NoDefaultEngineOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (SendTensor, NoDefaultEngineOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (ReceiveTensor, NoDefaultEngineOp< CPUContext >)
 
 REGISTER_CUDA_OPERATOR (CreateCommonWorld, NoDefaultEngineOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (CloneCommonWorld, NoDefaultEngineOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (Broadcast, NoDefaultEngineOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (Reduce, NoDefaultEngineOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (Allgather, NoDefaultEngineOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (Allreduce, NoDefaultEngineOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (SendTensor, NoDefaultEngineOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (ReceiveTensor, NoDefaultEngineOp< CUDAContext >)
 
 REGISTER_CPU_OPERATOR (Split, SplitOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (SplitByLengths, SplitByLengthsOp< CPUContext >)
 
INT_MAX Input (0,"input","(*Tensor*): tensor to split").Input(1
 
INT_MAX * Tuple (int)*):length of each output") .Arg( "order", "(*string *):order of dimensions of input and output blobs;either\"NCHW\" or \"NHWC\"").Output(0,"[output_0, output_1, ...]","(*Tensor*): output tensor").DeviceInferenceFunction(splitOpDevInfer).SetDoc(R"DOC(Split an `input` tensor into a list of tensors, along the axis specified by the `axis` dimension. The lengths of the split can be specified using argument `split` or optional second input blob to the operator. Otherwise, the tensor is split to equal sized parts.Github Links:- https:<details><summary> <b>Example</b> </summary>**Code**```workspace.ResetWorkspace()op = core.CreateOperator( "Split", ["input"], ["output_0","output_1","output_2"], split=(3,2,4), axis=0)workspace.FeedBlob("input", np.random.randint(10, size=(9)))print("input:", workspace.FetchBlob("input"))workspace.RunOperatorOnce(op)print("output_0:", workspace.FetchBlob("output_0"))print("output_1:", workspace.FetchBlob("output_1"))print("output_2:", workspace.FetchBlob("output_2"))```**Result**```input: [2 2 6 6 6 0 5 7 4]output_0: [2 2 6]output_1: [6 6]output_2: [0 5 7 4]```</details>)DOC").InheritOnnxSchema(
 
INT_MAX Input (0,"input","The tensor to split").Input(1
 
INT_MAX The tensor l_i indicates the logic block of input Arg ("axis","Which axis to split on").Arg("order"
 
INT_MAX The tensor l_i indicates the logic block of input Either NHWC or will split on C defaults to NCHW DeviceInferenceFunction ([](const OperatorDef &def){auto op_device=def.has_device_option()?def.device_option():DeviceOption();vector< DeviceOption > in_dev(def.input_size(), op_device);vector< DeviceOption > out_dev(def.output_size(), op_device);in_dev[1]=DeviceOption();return std::make_pair(in_dev, out_dev);}).SetDoc(R"DOC( Split a tensor into a list of tensors
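A minimal numpy sketch of the splitting rule described above (consecutive blocks along an axis whose sizes are given by a lengths vector); the concrete values are illustrative assumptions, not output of the registered operator.

```
import numpy as np

x = np.arange(9, dtype=np.float32)          # tensor to split along axis 0
lengths = np.array([3, 2, 4])               # l_i: size of each logic block of the input
assert lengths.sum() == x.shape[0]
outputs = np.split(x, np.cumsum(lengths)[:-1])
# outputs: [array([0., 1., 2.]), array([3., 4.]), array([5., 6., 7., 8.])]
```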
 
OpSchema::Cost CostInferenceForConcat (const OperatorDef &def, const std::vector< TensorShape > &in)
 
std::vector< TensorShape > TensorInferenceForConcat (const OperatorDef &def, const std::vector< TensorShape > &in)
 
 REGISTER_CUDA_OPERATOR (Split, SplitOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (Concat, ConcatOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (DepthSplit, SplitOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (DepthConcat, ConcatOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (SplitByLengths, SplitByLengthsOp< CUDAContext >)
 
 REGISTER_CPU_OPERATOR (Conditional, ConditionalOp< CPUContext >)
 
Conditional: applies the conditional operator along the first dimension of DataT and DataF and returns DataO. Note: DataT and DataF must have the exact same shape and type. Input(0, "Condition"): Boolean tensor to select DataT or DataF. Input(1): data to use when True. Input(2, "DataF"): data to use when False. Output(0): output data after applying ConditionalOp. IdenticalTypeAndShapeOfInput (1)
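A minimal numpy sketch of the selection rule described above (row i of the output comes from DataT where the condition is true, otherwise from DataF); the values are illustrative assumptions.

```
import numpy as np

condition = np.array([True, False, True])                 # selects along the first dimension
data_t = np.array([[1., 1.], [2., 2.], [3., 3.]])         # used where condition is True
data_f = np.array([[-1., -1.], [-2., -2.], [-3., -3.]])   # used where condition is False
data_o = np.where(condition[:, None], data_t, data_f)
# data_o: [[ 1.  1.] [-2. -2.] [ 3.  3.]]
```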
 
 NO_GRADIENT (Conditional)
 
std::vector< TensorShape > TensorInferenceForConvGradient (const OperatorDef &def, const std::vector< TensorShape > &in)
 
OpSchema::Cost CostInferenceForConvGradient (const OperatorDef &def, const vector< TensorShape > &inputs)
 
 REGISTER_CPU_OPERATOR (ConvGradient, ConvGradientOp< float, CPUContext >)
 
 NumInputs (2, 3).NumOutputs(1
 
 TensorInferenceFunction (TensorInferenceForConvGradient).CostInferenceFunction(CostInferenceForConvGradient)
 
 REGISTER_CPU_OPERATOR (Conv1DGradient, ConvGradientOp< float, CPUContext >)
 
 OPERATOR_SCHEMA (Conv1DGradient).NumInputs(2
 
 NumOutputs (1, 3)
 
 REGISTER_CPU_OPERATOR (Conv2DGradient, ConvGradientOp< float, CPUContext >)
 
 OPERATOR_SCHEMA (Conv2DGradient).NumInputs(2
 
 REGISTER_CPU_OPERATOR (Conv3DGradient, ConvGradientOp< float, CPUContext >)
 
 OPERATOR_SCHEMA (Conv3DGradient).NumInputs(2
 
 REGISTER_GRADIENT (Conv, GetConvGradient)
 
 REGISTER_GRADIENT (Conv1D, GetConvGradient)
 
 REGISTER_GRADIENT (Conv2D, GetConvGradient)
 
 REGISTER_GRADIENT (Conv3D, GetConvGradient)
 
std::function< void(OpSchema &)> ConvDocGenerator (const char *dim)
 
 REGISTER_CPU_OPERATOR (Conv, ConvOp< float, CPUContext >)
 
NumInputs(2, 3).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext >
 
 REGISTER_CPU_OPERATOR (Conv1D, ConvOp< float, CPUContext >)
 
NumInputs(2, 3).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext >
 
 REGISTER_CPU_OPERATOR (Conv2D, ConvOp< float, CPUContext >)
 
NumInputs(2, 3).NumOutputs(1).CostInferenceFunction(OpSchema
 
 REGISTER_CPU_OPERATOR (Conv3D, ConvOp< float, CPUContext >)
 
 REGISTER_CUDNN_OPERATOR (Conv, CudnnConvOp)
 
 REGISTER_CUDNN_OPERATOR (ConvGradient, CudnnConvGradientOp)
 
 REGISTER_CUDNN_OPERATOR (Conv1D, CudnnConvOp)
 
 REGISTER_CUDNN_OPERATOR (Conv1DGradient, CudnnConvGradientOp)
 
 REGISTER_CUDNN_OPERATOR (Conv2D, CudnnConvOp)
 
 REGISTER_CUDNN_OPERATOR (Conv2DGradient, CudnnConvGradientOp)
 
 REGISTER_CUDNN_OPERATOR (Conv3D, CudnnConvOp)
 
 REGISTER_CUDNN_OPERATOR (Conv3DGradient, CudnnConvGradientOp)
 
 REGISTER_CUDA_OPERATOR (Conv, ConvOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (ConvGradient, ConvGradientOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (Conv1D, ConvOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (Conv1DGradient, ConvGradientOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (Conv2D, ConvOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (Conv2DGradient, ConvGradientOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (Conv3D, ConvOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (Conv3DGradient, ConvGradientOp< float, CUDAContext >)
 
template<>
void createSharedBuffer< CPUContext > (Workspace *ws)
 
template<>
void runWithSharedBuffer< CPUContext > (Workspace *ws, std::function< void(Tensor *buffer)> f)
 
template<typename Context >
void createSharedBuffer (Workspace *ws)
 Creates a mutex and shared buffer in the workspace. More...
 
template<typename Context >
void runWithSharedBuffer (Workspace *ws, std::function< void(Tensor *buffer)> f)
 Thread-safe, can be invoked from RunOnDevice() to serialize access to shared buffer.
 
template<>
void createSharedBuffer< CUDAContext > (Workspace *ws)
 
template<>
void runWithSharedBuffer< CUDAContext > (Workspace *ws, std::function< void(Tensor *buffer)> f)
 
 REGISTER_CPU_OPERATOR (ConvTransposeGradient, ConvTransposeGradientOp< float, CPUContext >)
 
 OPERATOR_SCHEMA (ConvTransposeGradient).NumInputs(3).NumOutputs(1
 
 REGISTER_GRADIENT (ConvTranspose, GetConvTransposeGradient)
 
 REGISTER_CPU_OPERATOR (ConvTranspose, ConvTransposeOp< float, CPUContext >)
 
ConvTranspose takes an input data tensor $X$, an input weight tensor $filter$, and optionally an input bias tensor $bias$. It then computes the transposed convolution, sometimes referred to as deconvolution, and produces a single output tensor $Y$. The hyperparameters of the op, such as kernel size, stride, and padding, are specified as args. At each stride, the filter is deconvolved with a subset of $X$ and the $bias$ is added. This is done throughout the input data until the output computation is complete. The output shapes are computed as follows: the number of channels in the output feature map is the number of kernels specified in the filter blob, and the spatial height and width are computed from the kernel, stride, and padding hyperparameters. This operator inherits from the *ConvTransposeUnpoolOpBase* operator.
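A minimal sketch of driving the registered ConvTranspose operator from the Python frontend, in the style of the other examples in this listing. The input shape, kernel/stride/pad values, and the (input_channels, output_channels, kH, kW) filter layout used here are assumptions for illustration.

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
op = core.CreateOperator(
    "ConvTranspose", ["X", "filter", "bias"], ["Y"],
    kernel=3, stride=2, pad=0,
)
workspace.FeedBlob("X", np.random.randn(1, 3, 5, 5).astype(np.float32))       # NCHW input
workspace.FeedBlob("filter", np.random.randn(3, 8, 3, 3).astype(np.float32))  # assumed (C_in, C_out, kH, kW)
workspace.FeedBlob("bias", np.zeros(8, dtype=np.float32))
workspace.RunOperatorOnce(op)
print("Y shape:", workspace.FetchBlob("Y").shape)  # expect (1, 8, 11, 11): (5-1)*2 - 2*0 + 3 = 11
```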
 
 REGISTER_CUDNN_OPERATOR (ConvTranspose, CudnnConvTransposeOp< float >)
 
 REGISTER_CUDNN_OPERATOR (ConvTransposeGradient, CudnnConvTransposeGradientOp< float >)
 
 REGISTER_CUDA_OPERATOR (ConvTranspose, ConvTransposeOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (ConvTransposeGradient, ConvTransposeGradientOp< float, CUDAContext >)
 
 REGISTER_CPU_OPERATOR (CopyFromCPUInput, CopyOp< CPUContext, CPUContext, CPUContext >)
 
 REGISTER_CPU_OPERATOR (CopyOnDeviceLike, CopyOnDeviceLikeOp< CPUContext, CPUContext, CPUContext >)
 
 REGISTER_CPU_OPERATOR (Copy, CopyOp< CPUContext, CPUContext, CPUContext >)
 
 SetDoc (R"DOC( Copy tensor for GPU to CPU context. Must be run under GPU device option. )DOC").Input(0
 
The input tensor Output (0,"output","Tensor that will contain a copy of the input.")
 
 SetDoc (R"DOC( Copy tensor for CPU to GPU context. Must be run under GPU device option. )DOC").Input(0
 
 SetDoc (R"DOC( Take a CPU input tensor and copy it to an output in the current Context (GPU or CPU). This may involves cross-device MemCpy. )DOC").Input(0
 
The input CPU tensor Output (0,"output","either a TensorCUDA or a TensorCPU")
 
The input tensor Input (1,"dst","Tensor, on which device the copy will be performed.").Output(0
 
 REGISTER_GRADIENT (Copy, GetCopyGradient)
 
 REGISTER_GRADIENT (CopyGPUToCPU, GetGPUToCPUGradient)
 
 REGISTER_GRADIENT (CopyCPUToGPU, GetCPUToGPUGradient)
 
 REGISTER_CPU_OPERATOR (Cos, UnaryElementwiseOp< TensorTypes< float >, CPUContext, CosFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (CosGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, CosGradientFunctor< CPUContext >>)
 
Calculates the cosine of the given input tensor, element-wise.
 
 OPERATOR_SCHEMA (CosGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape()
 
 REGISTER_GRADIENT (Cos, GetCosGradient)
 
 REGISTER_CPU_OPERATOR (Cosh, UnaryElementwiseOp< TensorTypes< float >, CPUContext, CoshFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (CoshGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, CoshGradientFunctor< CPUContext >>)
 
 REGISTER_GRADIENT (Cosh, GetCoshGradient)
 
 REGISTER_CPU_OPERATOR (CosineEmbeddingCriterion, CosineEmbeddingCriterionOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (CosineEmbeddingCriterionGradient, CosineEmbeddingCriterionGradientOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (CreateCounter, CreateCounterOp< int64_t, CPUContext >)
 
 REGISTER_CPU_OPERATOR (ResetCounter, ResetCounterOp< int64_t, CPUContext >)
 
 REGISTER_CPU_OPERATOR (CountDown, CountDownOp< int64_t, CPUContext >)
 
 REGISTER_CPU_OPERATOR (CheckCounterDone, CheckCounterDoneOp< int64_t, CPUContext >)
 
 REGISTER_CPU_OPERATOR (CountUp, CountUpOp< int64_t, CPUContext >)
 
 REGISTER_CPU_OPERATOR (RetrieveCount, RetrieveCountOp< int64_t, CPUContext >)
 
NumInputs(1).NumOutputs(1).ScalarType(TensorProto
 
 SHOULD_NOT_DO_GRADIENT (CreateCounter)
 
 SHOULD_NOT_DO_GRADIENT (ResetCounter)
 
 SHOULD_NOT_DO_GRADIENT (CountDown)
 
 SHOULD_NOT_DO_GRADIENT (CountUp)
 
 SHOULD_NOT_DO_GRADIENT (RetrieveCount)
 
 CAFFE_KNOWN_TYPE (std::unique_ptr< Counter< int64_t >>)
 
 REGISTER_BLOB_SERIALIZER ((TypeMeta::Id< std::unique_ptr< Counter< int64_t >>>()), CounterSerializer)
 
 REGISTER_BLOB_DESERIALIZER (std::unique_ptr< Counter< int64_t >>, CounterDeserializer)
 
 REGISTER_CUDA_OPERATOR (CreateCounter, CreateCounterOp< int64_t, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (ResetCounter, ResetCounterOp< int64_t, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (CountDown, CountDownOp< int64_t, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (CheckCounterDone, CheckCounterDoneOp< int64_t, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (CountUp, CountUpOp< int64_t, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (RetrieveCount, RetrieveCountOp< int64_t, CUDAContext >)
 
 CAFFE_KNOWN_TYPE (detail::WorkspaceStack)
 
 REGISTER_CPU_OPERATOR (CreateScope, CreateScopeOp< CPUContext >)
 
 SHOULD_NOT_DO_GRADIENT (CreateScope)
 
 OPERATOR_SCHEMA (CreateScope).NumInputs(0).NumOutputs(1).SetDoc(R"DOC( 'CreateScope' operator initializes and outputs empty scope that is used by Do operator to store local blobs )DOC")
 
 REGISTER_CPU_OPERATOR (HasScope, HasScopeOp< CPUContext >)
 
 SHOULD_NOT_DO_GRADIENT (HasScope)
 
 OPERATOR_SCHEMA (HasScope).NumInputs(1).NumOutputs(1).SetDoc(R"DOC( Checks whether scope blob has any saved scopes left )DOC")
 
 REGISTER_CPU_OPERATOR (LabelCrossEntropy, LabelCrossEntropyOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (LabelCrossEntropyGradient, LabelCrossEntropyGradientOp< float, CPUContext >)
 
 SetDoc (R"DOC( This operator computes the cross entropy between a $NxD$ dimensional input data tensor $X$ and a one dimensional input label tensor $label$. The op produces a single length $N$ output tensor $Y$. Here, $N$ is considered the batch size and $D$ is the size of each element in the batch. In practice, it is most commonly used at the end of models as a part of the loss computation, after the SoftMax operator and before the AveragedLoss operator. The cross entropy operation is defined as follows $$Y_i = -log(X_{ij})$$ where ($i$, $j$) is the classifier's prediction of the $j$th class (the correct one), and $i$ is the batch size. Each log has a lower limit for numerical stability. The difference between *LabelCrossEntropy* and *CrossEntropy* is how the labels are specified. Here, the labels are a length $N$ list of integers, whereas in CrossEntropy the labels are a $NxD$ dimensional matrix of one hot label vectors. However, the results of computation should be the same, as shown in the two examples where ($i$, $j$) is the classifier's prediction of the $j$th class (the correct one), and $i$ is the batch size. Each log has a lower limit for numerical stability. Github Links: - https://github.com/caffe2/caffe2/blob/master/caffe2/operators/cross_entropy_op.h - https://github.com/caffe2/caffe2/blob/master/caffe2/operators/cross_entropy_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "LabelCrossEntropy", ["X", "label"], ["Y"] ) // Create X: Sample softmax output for 5-class model X = np.array([[.01, .05, .02, .02, .9],[.03, .1, .42, .05, .4]]) print("X:\n",X) // Create label: Sample 1-hot ground truth label vectors label = np.array([4,2]) print("label:\n",label) // Feed X & label into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("label", label.astype(np.int32)) // Run op workspace.RunOperatorOnce(op) // Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` **Result** ``` X: [[0.01 0.05 0.02 0.02 0.9 ] [0.03 0.1 0.42 0.05 0.4 ]] label: [4 2] Y: [0.10536055 0.8675006 ] ``` </details> )DOC").Input(0
 
Input tensor which is almost always the result of a softmax operation. $X$ is a 2D array of size $NxD$, where $N$ is the batch size and $D$ is the number of classes. Input (1,"label","Blob containing the labels used to compare the input. $label$ is a length $N$ list of integers, where each element is the integer label for the $n$th element of the batch.").Output(0
 
 REGISTER_GRADIENT (LabelCrossEntropy, GetLabelCrossEntropyGradient)
 
 REGISTER_CPU_OPERATOR (MakeTwoClass, MakeTwoClassOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (MakeTwoClassGradient, MakeTwoClassGradientOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (SigmoidCrossEntropyWithLogits, SigmoidCrossEntropyWithLogitsOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (SigmoidCrossEntropyWithLogitsGradient, SigmoidCrossEntropyWithLogitsGradientOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (WeightedSigmoidCrossEntropyWithLogits, WeightedSigmoidCrossEntropyWithLogitsOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (WeightedSigmoidCrossEntropyWithLogitsGradient, WeightedSigmoidCrossEntropyWithLogitsGradientOp< float, CPUContext >)
 
out[0] add_dims (in[0].dims(0))
 
out[0] add_dims (2)
 
 SetDoc (R"DOC( Given a vector of probabilities, this operator transforms this into a 2-column matrix with complimentary probabilities for binary classification. In explicit terms, given the vector X, the output Y is vstack(1 - X, X). )DOC").Input(0
 
Input vector of probabilities Output (0,"Y","2-column matrix with complimentary probabilities of X for ""binary classification")
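A minimal numpy sketch of the transformation described above, with an illustrative probability vector: column 0 holds the complementary probabilities 1 - X and column 1 holds X.

```
import numpy as np

X = np.array([0.1, 0.5, 0.9], dtype=np.float32)   # P(class == 1) for each example
Y = np.stack([1.0 - X, X], axis=1)                # column 0: P(class == 0), column 1: P(class == 1)
# Y: [[0.9 0.1]
#     [0.5 0.5]
#     [0.1 0.9]]
```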
 
 Arg ("log_D_trick", R"DOC( default is false; if enabled, will use the log d trick to avoid the vanishing gradients early on; see Goodfellow et. al (2014) )DOC").Arg("unjoined_lr_loss"
 
R"DOC( default is false; if enabled, the model will be allowed to train on an unjoined dataset, where some examples might be false negatives and might appear in the dataset later as (true) positive examples. )DOC") .NumInputs(2) .NumOutputs(1) .IdenticalTypeAndShapeOfInputDim(0
 
SetDoc (R"DOC( Given two matrices logits and targets, of same shape, (batch_size, num_classes), computes the sigmoid cross entropy between the two. Returns a tensor of shape (batch_size,) of losses for each example. )DOC").Input(0
 
Input(0): matrix of logits for each example and class. Input (1,"targets","matrix of targets, same shape as logits.").Output(0
 
 SetDoc (R"DOC( Given three matrices: logits, targets, weights, all of the same shape, (batch_size, num_classes), computes the weighted sigmoid cross entropy between logits and targets. Specifically, at each position r,c, this computes weights[r, c] * crossentropy(sigmoid(logits[r, c]), targets[r, c]), and then averages over each row. Returns a tensor of shape (batch_size,) of losses for each example. )DOC").Input(0
 
Input(0): matrix of logits for each example and class. Input(1): weights matrix of the same shape as logits. Output (0,"xentropy","Vector with the total xentropy for each example.")
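A minimal numpy sketch of the computation described above: element-wise cross entropy between sigmoid(logits) and targets, optionally weighted, then averaged over each row to give one loss per example. The values are illustrative assumptions, and the naive log/sigmoid form here is not the numerically stable formulation a production kernel would use.

```
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

logits  = np.array([[0.5, -1.0], [2.0, 0.0]], dtype=np.float32)   # (batch_size, num_classes)
targets = np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32)
weights = np.ones_like(logits)                                    # all-ones reduces to the unweighted op

p = sigmoid(logits)
xent = -(targets * np.log(p) + (1.0 - targets) * np.log(1.0 - p))  # per-element cross entropy
loss = np.mean(weights * xent, axis=1)                             # average over each row -> (batch_size,)
```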
 
 REGISTER_GRADIENT (MakeTwoClass, GetMakeTwoClassGradient)
 
 REGISTER_GRADIENT (SigmoidCrossEntropyWithLogits, GetSigmoidCrossEntropyWithLogitsGradient)
 
 REGISTER_GRADIENT (WeightedSigmoidCrossEntropyWithLogits, GetWeightedSigmoidCrossEntropyWithLogitsGradient)
 
 REGISTER_CPU_OPERATOR (CrossEntropy, CrossEntropyOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (CrossEntropyGradient, CrossEntropyGradientOp< float, CPUContext >)
 
 SetDoc (R"DOC( This operator computes the cross entropy between a $NxD$ dimensional input data tensor $X$ and a $NxD$ dimensional input label tensor $label$. The op produces a single length $N$ output tensor $Y$. Here, $N$ is considered the batch size and $D$ is the size of each element in the batch. In practice, it is most commonly used at the end of models as a part of the loss computation, after the SoftMax operator and before the AveragedLoss operator. The cross entropy operation is defined as follows $$Y_i = \sum_j (label_{ij} * log(X_{ij}))$$ where ($i$, $j$) is the classifier's prediction of the $j$th class (the correct one), and $i$ is the batch size. Each log has a lower limit for numerical stability. Github Links: - https://github.com/caffe2/caffe2/blob/master/caffe2/operators/cross_entropy_op.h - https://github.com/caffe2/caffe2/blob/master/caffe2/operators/cross_entropy_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "CrossEntropy", ["X", "label"], ["Y"] ) // Create X: Sample softmax output for 5-class model X = np.array([[.01, .05, .02, .02, .9],[.03, .1, .42, .05, .4]]) print("X:\n",X) // Create label: Sample 1-hot ground truth label vectors label = np.array([[0.,0.,0.,0.,1.],[0.,0.,1.,0.,0.]]) print("label:\n",label) // Feed X & label into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("label", label.astype(np.float32)) // Run op workspace.RunOperatorOnce(op) // Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` **Result** ``` X: [[0.01 0.05 0.02 0.02 0.9 ] [0.03 0.1 0.42 0.05 0.4 ]] label: [[0. 0. 0. 0. 1.] [0. 0. 1. 0. 0.]] Y: [0.10536055 0.8675006 ] ``` </details> )DOC").Input(0
 
Input tensor which is almost always the result of a softmax operation. $X$ is a 2D array of size $NxD$, where $N$ is the batch size and $D$ is the number of classes. Input (1,"label","Blob containing the labels used to compare the input. $label$ is the same shape as $X$.").Output(0
 
 REGISTER_GRADIENT (CrossEntropy, GetCrossEntropyGradient)
 
 REGISTER_CPU_OPERATOR (CTCBeamSearchDecoder, CTCBeamSearchDecoderOp< CPUContext >)
 
CTCBeamSearchDecoder: Arg: maximum number of candidates to carry over to the next activation step. Arg ("prune_threshold","Probability threshold below which outputs are ignored."). Input(0): float tensor sized [max_activation_length, batch_size, alphabet_size] of network logits (before softmax application). Input(1): optional int vector containing sequence lengths, having size [batch_size]; seq_len will be set to max_time if not provided. Output (0,"OUTPUT_LEN","Output_len matrix size (batch_size). ""Each index stores final output length of its corresponding batch item."). Output(1): values sized (total_decoded_outputs); the flattened vector of final output sequences, in batch order. InheritOnnxSchema ()
 
 SHOULD_NOT_DO_GRADIENT (CTCBeamSearchDecoder)
 
 REGISTER_CPU_OPERATOR (CTCGreedyDecoder, CTCGreedyDecoderOp< CPUContext >)
 
CTCGreedyDecoder: Arg("merge_repeated"): when merge_repeated is true, merge repeated classes in the output. SetDoc ("Greedy decoder for connectionist temporal classification."). Input(0): float tensor sized [max_time, batch_size, num_classes]. Input (1,"SEQ_LEN","(optional) 1D int vector containing sequence lengths, ""having size [batch_size]; ""seq_len will be set to max_time if not provided").Output(0
 
 REGISTER_CPU_OPERATOR (Cube, UnaryElementwiseOp< NumericTypes, CPUContext, CubeFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (CubeGradient, BinaryElementwiseOp< NumericTypes, CPUContext, CubeGradientFunctor< CPUContext >>)
 
 REGISTER_GRADIENT (Cube, GetCubeGradient)
 
 REGISTER_CPU_OPERATOR (DataCouple, DataCoupleOp< CPUContext >)
 
 EnforceOneToOneInplace ().SetDoc(R"DOC( A one to one operator that takes an arbitrary number of input and output blobs such that each input blob is in-place with its matching output blob. It then proceeds to do nothing with each of these operators. This serves two purposes. It can make it appear as if a blob has been written to
 
 CAFFE_KNOWN_TYPE (std::unique_ptr< dataset_ops::TreeCursor >)
 
 CAFFE_KNOWN_TYPE (dataset_ops::TensorVectorPtr)
 
 CAFFE_KNOWN_TYPE (dataset_ops::SharedTensorVectorPtr)
 
 OPERATOR_SCHEMA (DeformConvGradient).NumInputs(4
 
 NumOutputs (2, 4)
 
vector< TensorShape > TensorInferenceForDotProduct (const OperatorDef &, const vector< TensorShape > &in)
 
OpSchema::Cost CostInferenceForDotProduct (const OperatorDef &def, const vector< TensorShape > &in)
 
 REGISTER_CPU_OPERATOR (SquaredL2Distance, SquaredL2DistanceOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (SquaredL2DistanceGradient, SquaredL2DistanceGradientOp< float, CPUContext >)
 
 SetDoc (R"DOC( Given two input float tensors X, Y, and produces one output float tensor of the L2 difference between X and Y that is computed as ||(X - Y)^2 / 2||. )DOC").Input(0
 
or input tensor Input (1,"Y","1D or 2D input tensor (must have the same shape as X)").Output(0
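A minimal numpy sketch of the row-wise half squared L2 distance defined above, with illustrative inputs.

```
import numpy as np

X = np.array([[1., 2., 3.], [0., 0., 0.]], dtype=np.float32)
Y = np.array([[1., 0., 3.], [1., 1., 1.]], dtype=np.float32)
Z = 0.5 * np.sum((X - Y) ** 2, axis=1)   # row-wise half squared L2 distance -> [2.  1.5]
```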
 
 OPERATOR_SCHEMA (SquaredL2DistanceGradient).NumInputs(3).NumOutputs(2)
 
 REGISTER_GRADIENT (SquaredL2Distance, GetSquaredL2DistanceGradient)
 
 REGISTER_CPU_OPERATOR (L1Distance, L1DistanceOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (L1DistanceGradient, L1DistanceGradientOp< float, CPUContext >)
 
 SetDoc (R"DOC( Computes the row-wise L1 Distance between the two input tensors $X$ and $Y$, which is defined as $$L1Distance(\mathbf{x},\mathbf{y}) = \sum_{i}\mid x_i - y_i\mid$$ Note, both inputs must either be 1-dimensional or 2-dimensional and both must have the same shape. The output $Z$ will be 1-dimensional regardless and its length will equal the number of rows in the inputs. Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/distance_op.h - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/distance_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "L1Distance", ["X", "Y"], ["Z"] ) // Create X X = 5*np.ones((1, 4)) print("X:\n",X) // Create Y Y = np.ones((1, 4)) print("Y:\n",Y) // Feed X & Y into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("Y", Y.astype(np.float32)) // Run op workspace.RunOperatorOnce(op) // Collect Output print("Z:\n", workspace.FetchBlob("Z")) ``` **Result** ``` X: [[5. 5. 5. 5.]] Y: [[1. 1. 1. 1.]] Z: [16.] ``` </details> )DOC").Input(0
 
Input(0): first input tensor (1D or 2D). Input(1): second input tensor (must have the same shape as $X$). Output(0)
 
 OPERATOR_SCHEMA (L1DistanceGradient).NumInputs(3).NumOutputs(2)
 
 REGISTER_GRADIENT (L1Distance, GetL1DistanceGradient)
 
 REGISTER_CPU_OPERATOR (DotProduct, DotProductOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (DotProductGradient, DotProductGradientOp< float, CPUContext >)
 
 SetDoc (R"DOC( Computes and outputs the dot product of the two input float tensors `X` and `Y`. Note that `X` and `Y` must be either 1D or 2D, and they must be the same shape. The output tensor is 1D, which represents either the product of each element in a respective dimension if the inputs are 1D, or the sum of the products in a given dimension if the inputs are 2D matrices. Note that the actual dot product is a scalar value, which is effectively the sum of the elements in the 1D output tensor. For 1D inputs: Given two vectors $X = [x_0, x_1, x_2]$ and $Y = [y_0, y_1, y_2]$; $Z = [x_0 * y_0, x_1 * y_1, x_2 * y_2]$ For 2D inputs: Given two matrices: $$X = [[x_0^0, x_1^0, x_2^0], \\ [x_0^1, x_1^1, x_2^1], \\ [x_0^2, x_1^2, x_2^2], \\ ..., \\ [x_0^n, x_1^n, x_2^n]]$$ and $$Y = [[y_0^0, y_1^0, y_2^0], \\ [y_0^1, y_1^1, y_2^1], \\ [y_0^2, y_1^2, y_2^2], \\ ..., \\ [y_0^n, y_1^n, y_2^n]]$$ then $$Z = \biggl[\Big((x_0^0 * y_0^0) + (x_1^0 * y_1^0) + (x_2^0 * y_2^0)\Big), \\ \Big((x_0^1 * y_0^1) + (x_1^1 * y_1^1) + (x_2^1 * y_2^1)\Big), \\ \Big((x_0^2 * y_0^2) + (x_1^2 * y_1^2) + (x_2^2 * y_2^2)\Big), \\ ..., \\ \Big((x_0^n * y_0^n) + (x_1^n * y_1^n) + (x_2^n * y_2^n)\Big)\biggr]$$ Github Link: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/distance_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "DotProduct", ["X", "Y"], ["Z"] ) workspace.FeedBlob("X", np.random.randint(20, size=(5)).astype(np.float32)) workspace.FeedBlob("Y", np.random.randint(20, size=(5)).astype(np.float32)) print("X:\n", workspace.FetchBlob("X")) print("Y:\n", workspace.FetchBlob("Y")) workspace.RunOperatorOnce(op) print("Z:\n", workspace.FetchBlob("X")) workspace.ResetWorkspace() workspace.FeedBlob("X", np.random.randint(10, size=(3,3)).astype(np.float32)) workspace.FeedBlob("Y", np.random.randint(10, size=(3,3)).astype(np.float32)) print("X:\n", workspace.FetchBlob("X")) print("Y:\n", workspace.FetchBlob("Y")) workspace.RunOperatorOnce(op) print("Z:\n", workspace.FetchBlob("Z")) ``` **Result** ``` X: [ 2. 15. 2. 7. 12.] Y: [ 3. 12. 9. 3. 18.] Z: [ 2. 15. 2. 7. 12.] X: [[2. 0. 4.] [7. 7. 4.] [7. 9. 9.]] Y: [[2. 0. 8.] [9. 6. 1.] [7. 8. 0.]] Z: [ 36. 109. 121.] ``` </details> )DOC").Input(0
 
 OPERATOR_SCHEMA (DotProductGradient).NumInputs(3).NumOutputs(2)
 
 REGISTER_GRADIENT (DotProduct, GetDotProductGradient)
 
 REGISTER_CPU_OPERATOR (CosineSimilarity, CosineSimilarityOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (CosineSimilarityGradient, CosineSimilarityGradientOp< float, CPUContext >)
 
 SetDoc (R"DOC( This op takes two input float tensors of the same size, $X$ and $Y$, and produces one output float tensor , $Z$, calculated as the cosine similarity between $X$ and $Y$. Recall, the cosine similarity between two tensors $X$ and $Y$ is defined as: $$\mathbf{Z}=CosineSimilarity(\mathbf{X},\mathbf{Y}) = \frac{\mathbf{X}\cdot\mathbf{Y}}{\|\mathbf{X}\|\|\mathbf{Y}\|} = \frac{\sum_n^{i=1}X_iY_i}{\sqrt{\sum_n^{i=1}X_i^2}\sqrt{\sum_n^{i=1}Y_i^2}}$$ Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/distance_op.h - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/distance_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "CosineSimilarity", ["X", "Y"], ["Z"] ) // Create X X = np.random.randn(3, 3) print("X:\n",X) // Create Y Y = np.random.randn(3, 3) print("Y:\n",Y) // Feed X & Y into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("Y", Y.astype(np.float32)) // Run op workspace.RunOperatorOnce(op) // Collect Output print("Z:\n", workspace.FetchBlob("Z")) ``` **Result** ``` X: [[-0.42635564 -0.23831588 -0.25515547] [ 1.43914719 -1.05613228 1.01717373] [ 0.06883105 0.33386519 -1.46648334]] Y: [[-0.90648691 -0.14241514 -1.1070837 ] [ 0.92152729 -0.28115511 -0.17756722] [-0.88394254 1.34654037 -0.80080998]] Z: [-1.7849885e-23 1.7849885e-23 -1.0842022e-07] ``` </details> )DOC").Input(0
 
 OPERATOR_SCHEMA (CosineSimilarityGradient).NumInputs(3).NumOutputs(2)
 
 REGISTER_GRADIENT (CosineSimilarity, GetCosineSimilarityGradient)
 
 REGISTER_CPU_OPERATOR (DotProductWithPadding, DotProductWithPaddingOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (DotProductWithPaddingGradient, DotProductWithPaddingGradientOp< float, CPUContext >)
 
Given two input float tensors X and Y with different shapes, produces one output float tensor of the dot product between X and Y. We currently support two kinds of strategies to achieve this: 1) before doing the normal dot_product, pad the smaller tensor (using pad_value) to the same shape as the other one; 2) replicate the smaller tensor to the same shape as the other one. Note that the first dimensions of X and Y must be equal, and only the second dimension of X or Y can be padded. Input (0,"X","1D or 2D input tensor"). Input(1): 1D or 2D input tensor. Output (0,"Z","1D output tensor").IdenticalTypeAndShapeOfInputDim(0
 
Arg ("pad_value","the padding value for tensors with smaller dimension").Arg("replicate"
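A minimal numpy sketch of the two strategies described above, with illustrative inputs: padding the smaller second dimension with pad_value, and replicating the smaller tensor to the larger width.

```
import numpy as np

X = np.array([[1., 2., 3., 4.]], dtype=np.float32)   # shape (1, 4)
Y = np.array([[5., 6.]], dtype=np.float32)           # shape (1, 2): smaller second dimension
pad_value = 0.0

# Strategy 1: pad the smaller tensor with pad_value up to the larger width, then dot product per row.
Y_pad = np.pad(Y, ((0, 0), (0, X.shape[1] - Y.shape[1])), constant_values=pad_value)
Z_pad = np.sum(X * Y_pad, axis=1)                    # [17.]

# Strategy 2: replicate the smaller tensor to the larger width (widths must divide evenly).
Y_rep = np.tile(Y, (1, X.shape[1] // Y.shape[1]))
Z_rep = np.sum(X * Y_rep, axis=1)                    # [56.]
```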
 
 OPERATOR_SCHEMA (DotProductWithPaddingGradient).NumInputs(3).NumOutputs(2)
 
 REGISTER_GRADIENT (DotProductWithPadding, GetDotProductWithPaddingGradient)
 
 REGISTER_CPU_OPERATOR (Do, DoOp< CPUContext >)
 
 NumInputs (1, INT_MAX).NumOutputs(1
 
INT_MAX SetDoc (R"DOC( 'Do' control operator, executes a subnet in a separate workspace. Last blobs in the input and output lists should be the same blob created with CreateScope op. Arguments 'inner_blobs' and 'outer_blobs_idx' provide a mapping between selected inner blob names and corresponding outer blob indices. )DOC").Arg("net"
 
INT_MAX Subnet with blob bindings Arg ("inner_blobs","List of inner net blob names to bind to outer workspace").Arg("outer_blobs_idx"
 
INT_MAX Subnet with blob bindings Indices of corresponding outer workspace in operator outputs (skipping workspace blobs)") .Arg( "saved_fwd_blobs"
 
INT_MAX Subnet with blob bindings Indices of corresponding outer workspace in List of blobs from the forward Do operator workspace needed" "in backward pass, used in gradient Do operator") .Arg ("reuse_workspace","Whether to reuse workspace or create a new one in a given scope").AllowInplace([](int in
 
 REGISTER_CUDA_OPERATOR (Do, DoOp< CUDAContext >)
 
 REGISTER_CPU_OPERATOR (Dropout, DropoutOp< float, CPUContext >)
 
 REGISTER_CPU_GRADIENT_OPERATOR (DropoutGrad, DropoutGradientOp< float, CPUContext >)
 
 if (def.output().size()==2)
 
 SetDoc (R"DOC( `Dropout` takes one input data tensor (`X`) and produces two tensor outputs, `Y` and `mask`. If the `is_test` argument is zero (default=0), the output `Y` will be the input with random elements zeroed. The probability that a given element is zeroed is determined by the `ratio` argument. If the `is_test` argument is set to non-zero, the output `Y` is exactly the same as the input `X`. Note that outputs are scaled by a factor of $\frac{1}{1-ratio}$ during training, so that during test time, we can simply compute an identity function. This scaling is important because we want the output at test time to equal the expected value at training time. Dropout has been proven to be an effective regularization technique to prevent overfitting during training. Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/dropout_op.h - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/dropout_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "Dropout", ["X"], ["Y"] + ["mask"], ratio=0.5, is_test=0 ) workspace.FeedBlob("X", np.random.randint(10, size=(5, 5)).astype(np.float32)) print("X:", workspace.FetchBlob("X")) workspace.RunOperatorOnce(op) print("Y:", workspace.FetchBlob("Y")) print("mask:", workspace.FetchBlob("mask")) ``` **Result** ``` X: [[5. 4. 3. 6. 9.] [2. 1. 8. 0. 9.] [7. 3. 0. 6. 3.] [1. 8. 2. 6. 4.] [6. 2. 6. 4. 0.]] Y: [[ 0. 0. 0. 12. 18.] [ 0. 0. 16. 0. 0.] [ 0. 0. 0. 12. 6.] [ 0. 0. 4. 0. 0.] [12. 0. 0. 0. 0.]] mask: [[False False False True True] [False False True True False] [False False True True True] [False False True False False] [ True False False False False]] ``` </details> )DOC").Arg("ratio"
 
If zero (default), perform dropout; if non-zero (test mode), the output Y equals the input X.
 
 REGISTER_GRADIENT (Dropout, GetDropoutGradient)
 
 REGISTER_CPU_OPERATOR (AddGradient, BinaryElementwiseGradientOp< NumericTypes, CPUContext, AddFunctor< CPUContext >>)
 
 REGISTER_GRADIENT (Add, GetAddGradient)
 
 REGISTER_CPU_OPERATOR (Add, BinaryElementwiseOp< NumericTypes, CPUContext, AddFunctor< CPUContext >>)
 
 REGISTER_CUDA_OPERATOR (Add, BinaryElementwiseOp< NumericTypes, CUDAContext, AddFunctor< CUDAContext >>)
 
 REGISTER_CUDA_OPERATOR (AddGradient, BinaryElementwiseGradientOp< NumericTypes, CUDAContext, AddFunctor< CUDAContext >>)
 
 REGISTER_CPU_OPERATOR (DivGradient, BinaryElementwiseGradientOp< NumericTypes, CPUContext, DivFunctor< CPUContext >>)
 
 REGISTER_GRADIENT (Div, GetDivGradient)
 
 REGISTER_CPU_OPERATOR (Div, BinaryElementwiseOp< NumericTypes, CPUContext, DivFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (ElementwiseLinear, ElementwiseLinearOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (ElementwiseLinearGradient, ElementwiseLinearGradientOp< float, CPUContext >)
 
 REGISTER_GRADIENT (ElementwiseLinear, GetElementwiseLinearGradient)
 
 REGISTER_CPU_OPERATOR (MulGradient, BinaryElementwiseGradientOp< NumericTypes, CPUContext, MulFunctor< CPUContext >>)
 
 REGISTER_GRADIENT (Mul, GetMulGradient)
 
 REGISTER_CPU_OPERATOR (Mul, BinaryElementwiseOp< NumericTypes, CPUContext, MulFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (Not, UnaryElementwiseOp< BoolTypes, CPUContext, NotFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (Sign, UnaryElementwiseOp< NumericTypes, CPUContext, SignFunctor< CPUContext >>)
 
 REGISTER_CPU_COMPARE_OPERATOR (EQ)
 
 REGISTER_CPU_COMPARE_OPERATOR (NE)
 
 REGISTER_CPU_COMPARE_OPERATOR (LT)
 
 REGISTER_CPU_COMPARE_OPERATOR (LE)
 
 REGISTER_CPU_COMPARE_OPERATOR (GT)
 
 REGISTER_CPU_COMPARE_OPERATOR (GE)
 
 REGISTER_CPU_LOGICAL_BINARY_OPERATOR (And)
 
 REGISTER_CPU_LOGICAL_BINARY_OPERATOR (Or)
 
 REGISTER_CPU_LOGICAL_BINARY_OPERATOR (Xor)
 
 REGISTER_CPU_BITWISE_BINARY_OPERATOR (BitwiseAnd)
 
 REGISTER_CPU_BITWISE_BINARY_OPERATOR (BitwiseOr)
 
 REGISTER_CPU_BITWISE_BINARY_OPERATOR (BitwiseXor)
 
 REGISTER_CPU_OPERATOR (SumReduceLike, SumReduceLikeOp< CPUContext >)
 
 C10_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR (EQ)
 
 C10_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR (NE)
 
 C10_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR (LT)
 
 C10_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR (LE)
 
 C10_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR (GT)
 
 C10_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR (GE)
 
 C10_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR (And)
 
 C10_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR (Or)
 
 C10_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR (Xor)
 
 C10_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR (BitwiseAnd)
 
 C10_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR (BitwiseOr)
 
 C10_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR (BitwiseXor)
 
 CostInferenceFunction (PointwiseCostInference< 1 >).TensorInferenceFunction(ElementwiseOpShapeInference).FillUsing(MathDocGenerator("addition"
 
and the dimensions of the second input must be a contiguous subset of the dimensions of the first; for example, shape(B) may be a contiguous suffix of shape(A), or B may be a scalar.
 
 REGISTER_CPU_OPERATOR (SubGradient, BinaryElementwiseGradientOp< NumericTypes, CPUContext, SubFunctor< CPUContext >>)
 
 REGISTER_GRADIENT (Sub, GetSubGradient)
 
 REGISTER_CPU_OPERATOR (Sub, BinaryElementwiseOp< NumericTypes, CPUContext, SubFunctor< CPUContext >>)
 
 REGISTER_CUDA_OPERATOR (Sub, BinaryElementwiseOp< NumericTypes, CUDAContext, SubFunctor< CUDAContext >>)
 
 REGISTER_CUDA_OPERATOR (SubGradient, BinaryElementwiseGradientOp< NumericTypes, CUDAContext, SubFunctor< CUDAContext >>)
 
 REGISTER_CPU_OPERATOR (Sum, SumOp< CPUContext >)
 
 CostInferenceFunction (CostInferenceForSum).InputsCanCrossDevices().IdenticalTypeAndShapeOfInput(0).SetDoc(R"DOC( Element-wise sum of each of the input tensors. The first input tensor can be used in-place as the output tensor
 
in which case the sum will be done in place and the results will be accumulated in the first input tensor. All inputs and outputs must have the same shape and data type.
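The example attached to this schema is truncated in this listing; a complete run along the same lines might look as follows (the value fed to `B` is an illustrative assumption).

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
op = core.CreateOperator("Sum", ["A", "B"], ["C"])
workspace.FeedBlob("A", np.array([[1, 2], [3, 4]]).astype(np.float32))
workspace.FeedBlob("B", np.array([[5, 6], [7, 8]]).astype(np.float32))  # assumed value
workspace.RunOperatorOnce(op)
print("C:", workspace.FetchBlob("C"))   # [[ 6.  8.] [10. 12.]]
```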
 
 REGISTER_CPU_OPERATOR (Elu, UnaryElementwiseWithArgsOp< TensorTypes< float >, CPUContext, EluFunctor< CPUContext >>)
 
 REGISTER_CPU_GRADIENT_OPERATOR (EluGradient, BinaryElementwiseWithArgsOp< TensorTypes< float >, CPUContext, EluGradientFunctor< CPUContext >>)
 
 REGISTER_CUDNN_OPERATOR (Elu, CuDNNActivationOp< CUDNN_ACTIVATION_ELU >)
 
 REGISTER_CUDNN_OPERATOR (EluGradient, CuDNNActivationGradientOp< CUDNN_ACTIVATION_ELU >)
 
 REGISTER_CPU_OPERATOR (EnforceFinite, EnforceFiniteOp< CPUContext >)
 
 SHOULD_NOT_DO_GRADIENT (EnforceFinite)
 
 REGISTER_CPU_OPERATOR (EnsureClipped, EnsureClippedOp< float, CPUContext >)
 
 NumInputs (1, 3).NumOutputs(1).Input(0
 
Input(0): parameters to be normalized. Input (1,"indices","Sparse indices, only needed for sparse param"). Input(2): gradient, only needed for sparse param. Output (0,"output_param","param ensured to be clipped within range").AllowInplace(
 
 SetDoc (R"DOC( Given a tensor, apply clip after gradient is applied; when the param is sparse as indicated by valid indices and grad, in-place is required )DOC")
 
 SHOULD_NOT_DO_GRADIENT (EnsureClipped)
 
 REGISTER_CPU_OPERATOR (EnsureCPUOutput, EnsureCPUOutputOp< CPUContext >)
 
 SetDoc (R"DOC( This Op always create TensorCPU output, and may involves cross-device MemCpy. Under CPU Context, this Op takes TensorCPU as input. Under the CUDA Context, this Op accepts either CUDA or CPU Tensor input. )DOC").Input(0
 
The input CUDA or CPU tensor Output (0,"output","TensorCPU that is a copy of the input.")
 
 NO_GRADIENT (EnsureCPUOutput)
 
 REGISTER_CPU_OPERATOR (Erf, UnaryElementwiseOp< TensorTypes< float >, CPUContext, ErfFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (ErfGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, ErfGradientFunctor< CPUContext >>)
 
 REGISTER_GRADIENT (Erf, GetErfGradient)
 
 REGISTER_CPU_OPERATOR (Exp, UnaryElementwiseOp< TensorTypes< float >, CPUContext, ExpFunctor< CPUContext >>)
 
 REGISTER_GRADIENT (Exp, GetExpGradient)
 
 REGISTER_CUDA_OPERATOR (Exp, UnaryElementwiseOp< TensorTypes< float >, CUDAContext, ExpFunctor< CUDAContext >>)
 
 REGISTER_CPU_OPERATOR (Expand, ExpandOp< TensorTypes< std::int32_t, std::int64_t, float, double >, CPUContext >)
 
 REGISTER_CPU_OPERATOR (ExpandGradient, ExpandGradientOp< TensorTypes< std::int32_t, std::int64_t, float, double >, CPUContext >)
 
NumInputs(2).NumOutputs(1).SetDoc(R"DOC( Broadcast the input tensor to a materialized new tensor using the given shape. The broadcast rule is similar to "numpy.array(input)*numpy.ones(shape)": two corresponding dimensions must either be equal or one of them must equal 1. In order to align with PyTorch's expand, shape is allowed to have entries equal to -1, which means preserving the size of the corresponding dimension in X (so it is actually equivalent to equal to 1).) DOC") .Input(0
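A minimal numpy sketch of the broadcast rule quoted above, with an illustrative input and target shape.

```
import numpy as np

X = np.array([[1.], [2.], [3.]], dtype=np.float32)    # shape (3, 1)
shape = (3, 4)
Y = X * np.ones(shape, dtype=np.float32)              # broadcast rule: numpy.array(input) * numpy.ones(shape)
# Y has shape (3, 4); each row of X is repeated 4 times.
```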
 
 OPERATOR_SCHEMA (ExpandGradient).NumInputs(2).NumOutputs(1)
 
 REGISTER_GRADIENT (Expand, GetExpandGradient)
 
 REGISTER_CUDA_OPERATOR (Expand, ExpandOp< TensorTypes< std::int32_t, std::int64_t, float, double >, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (ExpandGradient, ExpandGradientOp< TensorTypes< std::int32_t, std::int64_t, float, double >, CUDAContext >)
 
 REGISTER_CPU_OPERATOR (ExpandDims, ExpandDimsOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (Squeeze, SqueezeOp< CPUContext >)
 
 TensorInferenceFunction ([](const OperatorDef &def, const vector< TensorShape > &in){ArgumentHelper helper(def);auto dims=helper.template GetRepeatedArgument< int >("dims");auto originalSize=dims.size();CAFFE_ENFORCE(originalSize > 0,"Parameter `dims` must be provided.");std::sort(dims.begin(), dims.end());dims.erase(std::unique(dims.begin(), dims.end()), dims.end());if(dims.size()< originalSize){LOG(WARNING)<< "Parameter `dims` has repeated dimensions.";}CAFFE_ENFORCE(dims.front() >=0,"Dimension ids must be non-negative.");CAFFE_ENFORCE_GE(in[0].dims_size()+dims.size(), dims.back()+1,"Input needs at least ",(1+dims.back()-dims.size())," dimensions given `dims`.");vector< TensorShape > out(1);int cur_pos=0;int idx=0;for(const auto new_dim:dims){for(int i=cur_pos;i< new_dim;i++){out[0].add_dims(in[0].dims(idx++));}out[0].add_dims(1);cur_pos=new_dim+1;}for(;idx< in[0].dims_size();idx++){out[0].add_dims(in[0].dims(idx));}out[0].set_data_type(in[0].data_type());return out;}).SetDoc(R"DOC( The *ExpandDims* op inserts single-dimensional entries into the shape of the input tensor *data
 
 SetDoc (R"DOC( The *Squeeze* op removes single-dimensional entries from the shape of the input tensor *data,* and produces a single output tensor *squeezed*. The op also takes an argument *dims* with a list of dimensions to squeeze. If the same blob is provided as input and output, the operation is copy-free. This is the exact inverse operation of *ExpandDims* given the same *dims* argument. Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/expand_squeeze_dims_op.h - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/expand_squeeze_dims_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "Squeeze", ["data"], ["squeezed"], dims=[0,1], ) workspace.FeedBlob("data", np.zeros((1,1,100,100)).astype(np.float32)) print("data.shape:", workspace.FetchBlob("data").shape) workspace.RunOperatorOnce(op) print("squeezed.shape:", workspace.FetchBlob("squeezed").shape) ``` **Result** ``` data.shape: (1, 1, 100, 100) squeezed.shape: (100, 100) ``` </details> )DOC").Input(0
 
Input tensor of data to be operated on Output (0,"squeezed","Reshaped tensor with same data as input.").Arg("dims"
 
dims erase (std::unique(dims.begin(), dims.end()), dims.end())
 
 if (dims.size()< originalSize)
 
 CAFFE_ENFORCE (dims.front() >=0,"Dimension ids must be non-negative.")
 
vector< TensorShape > out (1)
 
 REGISTER_GRADIENT (Squeeze, GetSqueezeGradient)
 
 REGISTER_GRADIENT (ExpandDims, GetExpandDimsGradient)
 
 REGISTER_CUDA_OPERATOR (Squeeze, SqueezeOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (ExpandDims, ExpandDimsOp< CUDAContext >)
 
std::vector< TensorShape > FCShapeInference (const OperatorDef &def, const vector< TensorShape > &in, bool pretransposed_weight)
 
OpSchema::Cost CostInferenceForFC (const OperatorDef &def, const vector< TensorShape > &in, bool pretransposed_weight)
 
 REGISTER_CPU_OPERATOR (FeedBlob, FeedBlobOp< CPUContext >)
 
 SHOULD_NOT_DO_GRADIENT (FeedBlob)
 
 NumInputs (0, 0).NumOutputs(1
 
 SetDoc (R"DOC( FeedBlobs the content of the blobs. The input and output blobs should be one-to-one inplace.)DOC").Arg("value"
 
 REGISTER_CPU_OPERATOR (UniformFill, UniformFillOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (UniformIntFill, UniformFillOp< int, CPUContext >)
 
 REGISTER_CPU_OPERATOR (UniqueUniformFill, UniqueUniformFillOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (ConstantFill, ConstantFillOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (DiagonalFill, DiagonalFillOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (GaussianFill, GaussianFillOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (XavierFill, XavierFillOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (MSRAFill, MSRAFillOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (RangeFill, RangeFillOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (LengthsRangeFill, LengthsRangeFillOp< CPUContext >)
 
 TensorInferenceFunction (FillerTensorInference<>).SetDoc(R"DOC( This operator fills the elements of the output tensor with a const ant value specified by the `value` argument. - The data type is specified by the `dtype` argument - Currently
 
the data types supported are *float *int32 *int64 and *bool *If the dtype argument is not the data type of value is used The output tensor shape is either specified by the shape argument or will match the shape of the input tensor if one is provided (if an input tensor is provided, a shape argument should not be set)-Optional additional dimensions can be appended at the end as specified by`extra_shape`argument-If`input_as_shape`is set to True
 
the data types supported are *float *int32 *int64 and *bool *If the dtype argument is not the data type of value is used The output tensor shape is either specified by the shape argument or will match the shape of the input tensor if one is the input should be a tensor containing the desired output shape (the dimensions specified in`extra_shape`will also be appended) When specifying`dtype`argument
 
shape input must be in CPU context Input (0,"shape","(*Tensor`<int>`*): 1-D tensor of the shape of the output, must be used with `input_as_shape` argument").Input(1
 
shape input must be in CPU context inclusive Input (2,"max","(*Tensor`<float>`*): scalar tensor containing maximum value, inclusive").Output(0
 
 NumInputs ({0, 1, 3}).NumOutputs(1).AllowInplace(
 
 TensorInferenceFunction (FillerTensorInference< TensorProto_DataType_INT32 >).SetDoc(R"DOC( Fill the output tensor with int32 samples from uniform distribution [`min`
 
max`]. The range can be defined either by arguments or input blobs; `min` and `max` are inclusive. If the range is given by input blobs, you also need to give the shape as input. When the range is given as arguments, this operator enforces min <= max; when the range is given as inputs, the constraint is not enforced. When the range is given as inputs and max < min, the first dimension of the output is set to 0. This behavior is allowed so that dynamically sampling indices into a dynamically sized tensor is possible. The shape of the output can be given as argument or input.
 
shape input must be in CPU context inclusive Input (2,"max","(*Tensor`<int>`*): scalar tensor containing maximum value, inclusive").Output(0
 
 NumInputs (0, 2).NumOutputs(1).AllowInplace(
 
If the second input is given, its elements will be excluded from uniform sampling; using the second input will require you to provide shape via the first input. Arg("min"): minimum value, inclusive. Arg("max"): maximum value, inclusive. Arg ("dtype","The data type for the elements of the output tensor. ""Strictly must be one of the types from DataType enum in TensorProto. ""This only supports INT32 and INT64 now. If not set, assume INT32"). Arg("shape"): the shape of the output tensor; cannot set the shape argument and pass in an input at the same time. Arg ("extra_shape","The additional dimensions appended at the end of the shape indicated ""by the input blob. ""Cannot set the extra_shape argument when there is no input blob."). Arg("input_as_shape"): tensor containing the desired output shape; first input must be in CPU context. Input (0,"input","Input tensor to provide shape information"). Input(1): (optional) avoid elements in this tensor; elements must be unique. Output (0,"output","Output tensor of unique uniform samples")
 
if *input_as_shape *is set to *true then the *input *should be a tensor containing the desired output shape (the dimensions specified in *extra_shape *will also be appended).In this case
 
template<int VALUE_TYPE = TensorProto_DataType_FLOAT>
std::vector< TensorShape > FillerTensorInference (const OperatorDef &def, const vector< TensorShape > &in)
 
Input(0): index (integers). Input(1): needles / query. Output (0,"query_indices","Indices of the needles in index or 'missing value'"). Arg("missing_value"): placeholder for items that are not found. SetDoc (R"DOC( Finds elements of the second input in the first input, outputting the last (max) index for each query. If the query is not found, missing_value is inserted. See IndexGet() for a version that modifies the index when values are not found. )DOC")
 
 REGISTER_CPU_OPERATOR (Flatten, FlattenOp< CPUContext >)
 
std::vector< TensorShape > TensorInferenceForFlatten (const OperatorDef &def, const std::vector< TensorShape > &in)
 
 REGISTER_CPU_OPERATOR (FlexibleTopK, FlexibleTopKOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (FlexibleTopKGradient, FlexibleTopKGradientOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (Floor, FloorOp< float, CPUContext >)
 
 SetDoc (R"DOC( Element-wise application of the floor function ($y=floor(x)$) to the input tensor `X`. Output tensor shape is the same as the input tensor. This operator can be used in an in-place fashion by using the same input blob as the output blob. Github Link: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/floor_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "Floor", ["X"], ["X"], ) workspace.FeedBlob("X", (np.random.uniform(-10, 10, (5,5))).astype(np.float32)) print("X before running op:", workspace.FetchBlob("X")) workspace.RunOperatorOnce(op) print("X after running op:", workspace.FetchBlob("X")) ``` **Result** ``` X before running op: [[ 3.813361 -1.319647 5.2089314 -4.931328 0.6218652 ] [ 7.2757645 5.5552588 5.785643 -2.4790506 -0.41400087] [ 1.1541046 -6.933266 3.3754056 1.6569928 -1.7670316 ] [-3.4932013 4.891472 1.5530115 -3.2443287 -4.605099 ] [-4.574543 -7.360948 5.91305 -8.196495 -5.357458 ]] X after running op: [[ 3. -2. 5. -5. 0.] [ 7. 5. 5. -3. -1.] [ 1. -7. 3. 1. -2.] [-4. 4. 1. -4. -5.] [-5. -8. 5. -9. -6.]] ``` </details> )DOC").Input(0
 
 GRADIENT_NOT_IMPLEMENTED_YET (Floor)
 
 REGISTER_CPU_OPERATOR (Free, FreeOp< CPUContext >)
 
 SHOULD_NOT_DO_GRADIENT (Free)
 
INT_MAX SameNumberOfOutput ().EnforceOneToOneInplace().SetDoc(R"DOC( Frees the content of the blobs. The input and output blobs should be one-to-one inplace.)DOC")
 
 REGISTER_CUDA_OPERATOR (Free, FreeOp< CUDAContext >)
 
 REGISTER_CPU_OPERATOR (FC, FullyConnectedOp< CPUContext >)
 
 REGISTER_CPU_GRADIENT_OPERATOR (FCGradient, FullyConnectedGradientOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (FCTransposed, FullyConnectedOp< CPUContext, DefaultEngine, false >)
 
 REGISTER_CPU_GRADIENT_OPERATOR (FCTransposedGradient, FullyConnectedGradientOp< CPUContext, DefaultEngine, false >)
 
 REGISTER_CUDA_OPERATOR (FC, FullyConnectedOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (FCGradient, FullyConnectedGradientOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (FCTransposed, FullyConnectedOp< CUDAContext, DefaultEngine, false >)
 
 REGISTER_CUDA_OPERATOR (FCTransposedGradient, FullyConnectedGradientOp< CUDAContext, DefaultEngine, false >)
 
 REGISTER_CPU_OPERATOR (FloatToFused8BitRowwiseQuantized, FloatToFused8BitRowwiseQuantizedOp< float, convertfp32fp32, CPUContext >)
 
set_dims (1, X.dims(1)+8)
 
out push_back (std::move(X))
 
out[0] set_data_type (TensorProto_DataType_UINT8)
 
 SetDoc (R"DOC( Applies 8-bit row-wise quantization by determining the range (maximum - minimum) and offset (minimum value) of each row in the input matrix, and then scaling each element to an 8-bit number between 0 and 255. To later de-quantize values, the scale (range / 255) and offset (bias) are stored alongside the data. More precisely, the first 4 bytes of each row in the output matrix are a 32-bit float storing the scale, the next 4 bytes store the bias as a 32-bit float, and all remaining bytes in the row encode single quantized values.) )DOC").Input(0
 
Float32 input data Output (0,"output","Fused scale, bias and quantized data")
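A minimal numpy sketch of the row-wise 8-bit quantization and fused layout described above: per-row scale = range / 255, offset = row minimum, then scale and bias stored as float32 bytes followed by the quantized values. The input matrix is an illustrative assumption, and the sketch assumes every row has a nonzero range.

```
import numpy as np

X = np.random.randn(4, 10).astype(np.float32)              # one embedding row per line
bias = X.min(axis=1, keepdims=True)                        # per-row offset (minimum)
scale = (X.max(axis=1, keepdims=True) - bias) / 255.0      # per-row scale (range / 255); assumes nonzero range
quantized = np.round((X - bias) / scale).astype(np.uint8)

# Fused row layout: 4 bytes of float32 scale, 4 bytes of float32 bias, then the quantized bytes.
fused = np.concatenate(
    [scale.astype(np.float32).view(np.uint8),
     bias.astype(np.float32).view(np.uint8),
     quantized],
    axis=1,
)
print(fused.shape)   # (4, 18) == (rows, X.dims(1) + 8), matching the shape inference above
```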
 
 NO_GRADIENT (FloatToFused8BitRowwiseQuantized)
 
 REGISTER_CPU_OPERATOR (HalfFloatToFused8BitRowwiseQuantized, FloatToFused8BitRowwiseQuantizedOp< at::Half, convertfp16fp32, CPUContext >)
 
 NO_GRADIENT (HalfFloatToFused8BitRowwiseQuantized)
 
 REGISTER_CPU_OPERATOR (Fused8BitRowwiseQuantizedToFloat, Fused8BitRowwiseQuantizedToFloatOp< float, convertfp32fp32, CPUContext >)
 
set_dims (1, X.dims(1)-8)
 
out[0] set_data_type (TensorProto_DataType_FLOAT)
 
 SetDoc (R"DOC( De-quantizes the result of the FloatToFused8BitRowwiseQuantized operator. The input is expected to encode the scale as a 32-bit float in the second to the last 4 bytes of each row, followed by the bias as a 32-bit float in the next 4 bytes, and the quantized values in the preceding bytes of the row. The output is a matrix containing only the values, but de-quantized. De-quantization is performed by multiplying each value by its row's scale and bias parameters. The de-quantized values will thus not be exactly equal to the original, un-quantized floating point values. )DOC").Input(0
 
Fused bias and quantized data Output (0,"float_output","Float32 data")
 
 NO_GRADIENT (Fused8BitRowwiseQuantizedToFloat)
 
 REGISTER_CPU_OPERATOR (Fused8BitRowwiseQuantizedToHalfFloat, Fused8BitRowwiseQuantizedToFloatOp< at::Half, convertfp32fp16, CPUContext >)
 
out[0] set_data_type (TensorProto_DataType_FLOAT16)
 
 SetDoc (R"DOC( De-quantizes the result of the HalfFloatToFused8BitRowwiseQuantized operator. The input is expected to encode the scale as a 32-bit float in the second to the last 4 bytes of each row, followed by the bias as a 32-bit float in the next 4 bytes, and the quantized values in the preceding bytes of the row. The output is a matrix containing only the values, but de-quantized. De-quantization is performed by multiplying each value by its row's scale and bias parameters. The de-quantized values will thus not be exactly equal to the original, un-quantized floating point values. )DOC").Input(0
 
Fused bias and quantized data Output (0,"float16_output","Float16 data")
 
 NO_GRADIENT (Fused8BitRowwiseQuantizedToHalfFloat)
 
 REGISTER_CPU_OPERATOR (FloatToFusedRandRowwiseQuantized, FloatToFusedRandRowwiseQuantizedOp< CPUContext >)
 
set_dims (1, 10+(X.dims(1)+data_per_byte-1)/data_per_byte)
 
 SetDoc (R"DOC( Applies row-wise stochastic/random quantization by determining the range of each row in the input matrix, and then quantize each element to one of two closest discrete levels by randomly drawing Bernoulli distribution. The method is extended from TernGrad [1], which randomly quantizes gradients to three levels to reduce communication in distributed training. The format of each row (x) in the output matrix is [bitwidth][tail][min][max][data]: bitwidth[1 Byte]: bitwidth per data [1, 2, 4 or 8]; tail[1 Byte]: the number of unused buckets [1-8] (One byte is split to 8/bitwidth buckets and each bucket stores one low-precision data in bitwidth bits); min[4 Bytes]: the minimum floating value min(x); max[4 Bytes]: the maximum floating value max(x); data: quantized data. The quantization is uniform with levels q = min + (max-min)/(2^bitwidth - 1)*[0:1:2^bitwidth]. During stochastic/random quantization x'=Quantize(x), for q_j < x_i <= q_{j+1}, we draw quantization x'_i from Bernoulli distributions with P(x'_i = q_{j+1}) = (x_i - q_j)/(q_{j+1} - q_j), and P(x'_i = q_j) = (q_{j+1} - x_i)/(q_{j+1} - q_j) where x'_i is the quantized value of x_i. [1] proved E{x'_i}=x_i, which is an unbiased approximation. More details are in the paper. For example, suppose targeted bitwidth = 2 and x = [0.3, -1.4, -0.6, 0.9, 1.0], then tail = 3, min = -1.4, max = 1.0 and q = [-1.4, -0.6, 0.2, 1.0]. x_1 = 0.3 will be quantized to x'_1 = 0.2 with probability 7/8 and to x'_1 = 1.0 with probability 1/8. The storage format of quantized data is: [x'_1|x'_3|x'_5|xxx]-[x'_2|x'_4|xxx|xxx]. In general, a input row is split to multiple segments. One segment is a continuous subarray of the row, and its length is the number of bytes storing quantized data in the output matrix. The b-th bucket of the i-th byte stores the i-th data of the b-th segment of input row. [1] Wen, Wei, Cong Xu, Feng Yan, Chunpeng Wu, Yandan Wang, Yiran Chen, and Hai Li. "Terngrad:Ternary gradients to reduce communication in distributed deep learning." In Advances in Neural Information Processing Systems, pp. 1508-1518. 2017. )DOC").Input(0
 
Float32 input data Output (0,"output","Fused bitwidth, tail, min, max and quantized data").Arg("bitwidth"
 
Float32 input data How many bits to quantize per data (defaults to 8).") .Arg("random"
 
Float32 input data Random quantization or not (True). False is set up for unittest.")
 
 NO_GRADIENT (FloatToFusedRandRowwiseQuantized)
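
A minimal sketch of the stochastic quantizer, assuming the Python frontend; it only checks the fused output layout described above (10 header bytes plus ceil(D / data_per_byte) data bytes, where data_per_byte = 8 / bitwidth):

```
import numpy as np
from caffe2.python import core, workspace

workspace.ResetWorkspace()
X = np.random.randn(2, 16).astype(np.float32)
workspace.FeedBlob("X", X)

op = core.CreateOperator(
    "FloatToFusedRandRowwiseQuantized", ["X"], ["X_q"],
    bitwidth=2,      # 4 values packed per byte
    random=True,     # set False for the deterministic variant used in unit tests
)
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("X_q").shape)   # (2, 14) = (2, 10 + 16 / 4)
```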
 
 REGISTER_CPU_OPERATOR (FusedRandRowwiseQuantizedToFloat, FusedRandRowwiseQuantizedToFloatOp< CPUContext >)
 
const vector< TensorShape > & for (int i=0;i< def.output_size();i++)
 
 SetDoc (R"DOC( De-quantizes the result of the FloatToFusedRandRowwiseQuantized operator. Refer FloatToFusedRandRowwiseQuantized operator for details. )DOC").Input(0
 
Fused max and quantized data Output (0,"float_input","Float32 data")
 
 NO_GRADIENT (FusedRandRowwiseQuantizedToFloat)
 
but operating on bit rowwise quantized matrices with fused storage (where each row stores quantized values, and then the scale and offset).DATA needs to have rank 2 and INDICES needs to have rank 1.) DOC") .Input( 0
 
but operating on bit rowwise quantized matrices with fused uint8 tensor with rank obtained with operator FloatToFused8BitRowwiseQuantized") .Input (1,"INDICES","Integer vector containing indices of the first dimension of DATA for""the rows that are being gathered").Output(0
 
but operating on bit rowwise quantized matrices with fused uint8 tensor with rank obtained with output TensorInferenceFunction ([](const OperatorDef &def, const vector< TensorShape > &in){vector< TensorShape > out(1);for(auto d:in[1].dims()){out[0].add_dims(d);}for(int i=1;i< in[0].dims_size();++i){out[0].add_dims(in[0].dims(i));}out[0].set_data_type(in[0].data_type());return out;})
 
 REGISTER_CPU_OPERATOR (GatherFused8BitRowwise, GatherFused8BitRowwiseOp< CPUContext >)
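
A minimal sketch, assuming the Python frontend: rows gathered out of the fused matrix keep their scale/bias bytes, so the result can still be de-quantized later:

```
import numpy as np
from caffe2.python import core, workspace

workspace.ResetWorkspace()
workspace.FeedBlob("data", np.random.rand(5, 4).astype(np.float32))
workspace.RunOperatorOnce(core.CreateOperator(
    "FloatToFused8BitRowwiseQuantized", ["data"], ["data_q"]))   # (5, 12) uint8

workspace.FeedBlob("indices", np.array([0, 3, 3], dtype=np.int32))
workspace.RunOperatorOnce(core.CreateOperator(
    "GatherFused8BitRowwise", ["data_q", "indices"], ["rows_q"]))
print(workspace.FetchBlob("rows_q").shape)   # (3, 12): gathered rows, still fused
```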
 
 REGISTER_CPU_OPERATOR (Gather, GatherOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (GenerateProposals, GenerateProposalsOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (GenerateProposalsCPP, GenerateProposalsOp< CPUContext >)
 
bounding box regression result deltas as well as predefined bounding box shapes anchors Greedy non maximum suppression is applied to generate the final bounding boxes DOC Arg ("spatial_scale","(float) spatial scale").Arg("pre_nms_topN"
 
bounding box regression result deltas as well as predefined bounding box shapes anchors Greedy non maximum suppression is applied to generate the final bounding boxes DOC int RPN_PRE_NMS_TOP_N Arg ("post_nms_topN","(int) RPN_POST_NMS_TOP_N").Arg("nms_thresh"
 
bounding box regression result deltas as well as predefined bounding box shapes anchors Greedy non maximum suppression is applied to generate the final bounding boxes DOC int RPN_PRE_NMS_TOP_N float RPN_NMS_THRESH Arg ("min_size","(float) RPN_MIN_SIZE").Arg("angle_bound_on"
 
bounding box regression result deltas as well as predefined bounding box shapes anchors Greedy non maximum suppression is applied to generate the final bounding boxes DOC int RPN_PRE_NMS_TOP_N float RPN_NMS_THRESH bool (default true).If set
 
bounding box regression result deltas as well as predefined bounding box shapes anchors Greedy non maximum suppression is applied to generate the final bounding boxes DOC int RPN_PRE_NMS_TOP_N float RPN_NMS_THRESH for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] Arg ("angle_bound_lo","int (default -90 degrees). If set, for rotated boxes, angle is ""normalized to be within [angle_bound_lo, angle_bound_hi].").Arg("angle_bound_hi"
 
bounding box regression result deltas as well as predefined bounding box shapes anchors Greedy non maximum suppression is applied to generate the final bounding boxes DOC int RPN_PRE_NMS_TOP_N float RPN_NMS_THRESH for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] int (default 90 degrees).If set
 
bounding box regression result deltas as well as predefined bounding box shapes anchors Greedy non maximum suppression is applied to generate the final bounding boxes DOC int RPN_PRE_NMS_TOP_N float RPN_NMS_THRESH for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] Arg ("clip_angle_thresh","float (default 1.0 degrees). For RRPN, clip almost horizontal boxes ""within this threshold of tolerance for backward compatibility. ""Set to negative value for no clipping.").Input(0
 
bounding box regression result deltas as well as predefined bounding box shapes anchors Greedy non maximum suppression is applied to generate the final bounding boxes DOC int RPN_PRE_NMS_TOP_N float RPN_NMS_THRESH for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] Scores from conv size (img_count, A, H, W)") .Input( 1
 
bounding box regression result deltas as well as predefined bounding box shapes anchors Greedy non maximum suppression is applied to generate the final bounding boxes DOC int RPN_PRE_NMS_TOP_N float RPN_NMS_THRESH for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] Scores from conv Bounding box deltas from conv size (img_count, 4 *A, H, W)") .Input( 2
 
bounding box regression result deltas as well as predefined bounding box shapes anchors Greedy non maximum suppression is applied to generate the final bounding boxes DOC int RPN_PRE_NMS_TOP_N float RPN_NMS_THRESH for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] Scores from conv Bounding box deltas from conv Image size (img_count, 3)
 
bounding box regression result deltas as well as predefined bounding box shapes anchors Greedy non maximum suppression is applied to generate the final bounding boxes DOC int RPN_PRE_NMS_TOP_N float RPN_NMS_THRESH for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] Scores from conv Bounding box deltas from conv Image format (height, width, scale)") .Input(3
 
bounding box regression result deltas as well as predefined bounding box shapes anchors Greedy non maximum suppression is applied to generate the final bounding boxes DOC int RPN_PRE_NMS_TOP_N float RPN_NMS_THRESH for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] Scores from conv Bounding box deltas from conv Image Bounding box size (A, 4)") .Output( 0
 
bounding box regression result deltas as well as predefined bounding box shapes anchors Greedy non maximum suppression is applied to generate the final bounding boxes DOC int RPN_PRE_NMS_TOP_N float RPN_NMS_THRESH for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] Scores from conv Bounding box deltas from conv Image Bounding box size (n x 5)
 
bounding box regression result deltas as well as predefined bounding box shapes anchors Greedy non maximum suppression is applied to generate the final bounding boxes DOC int RPN_PRE_NMS_TOP_N float RPN_NMS_THRESH for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] Scores from conv Bounding box deltas from conv Image Bounding box format (image_index, x1, y1, x2, y2)") .Output(1
 
bounding box regression result deltas as well as predefined bounding box shapes anchors Greedy non maximum suppression is applied to generate the final bounding boxes DOC int RPN_PRE_NMS_TOP_N float RPN_NMS_THRESH for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] for rotated angle is normalized to be within[angle_bound_lo, angle_bound_hi] Scores from conv Bounding box deltas from conv Image Bounding box scores of size (n)")
 
 OPERATOR_SCHEMA (GenerateProposalsCPP).NumInputs(4).NumOutputs(2)
 
 SHOULD_NOT_DO_GRADIENT (GenerateProposals)
 
 SHOULD_NOT_DO_GRADIENT (GenerateProposalsCPP)
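
A minimal sketch of driving GenerateProposals from the Python frontend; shapes follow the Input descriptions above (scores (N, A, H, W), deltas (N, 4*A, H, W), im_info (N, 3), anchors (A, 4)), and the argument values are illustrative rather than tuned:

```
import numpy as np
from caffe2.python import core, workspace

np.random.seed(0)
N, A, H, W = 1, 3, 4, 4
workspace.ResetWorkspace()
workspace.FeedBlob("scores", np.random.rand(N, A, H, W).astype(np.float32))
workspace.FeedBlob("bbox_deltas",
                   (0.1 * np.random.randn(N, 4 * A, H, W)).astype(np.float32))
workspace.FeedBlob("im_info", np.array([[64.0, 64.0, 1.0]], dtype=np.float32))
workspace.FeedBlob("anchors", np.array(
    [[-8, -8, 8, 8], [-16, -16, 16, 16], [-32, -32, 32, 32]], dtype=np.float32))

op = core.CreateOperator(
    "GenerateProposals",
    ["scores", "bbox_deltas", "im_info", "anchors"],
    ["rois", "rois_probs"],
    spatial_scale=1.0 / 16.0,
    pre_nms_topN=50, post_nms_topN=10,
    nms_thresh=0.7, min_size=2.0,
)
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("rois").shape)        # (n, 5): image_index, x1, y1, x2, y2
print(workspace.FetchBlob("rois_probs").shape)  # (n,)
```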
 
 REGISTER_CPU_OPERATOR (GivenTensorByteStringToUInt8Fill, GivenTensorByteStringToUInt8FillOp< CPUContext >)
 
 NO_GRADIENT (GivenTensorByteStringToUInt8Fill)
 
 SetDoc (R"DOC( This op fills a uint8 output tensor with the data specified by the *value* argument. The data must previously be serialized as a byte string. The output tensor shape is specified by the *shape* argument. Beware, when using this argument *value* should have a value for every element of the *output*, as missing values will not be initialized automatically. If *input_as_shape* is set to *true*, then the *input* should be a 1D tensor containing the desired output shape (the dimensions specified in *extra_shape* will also be appended). In this case, the *shape* argument should **not** be set. This op allows us to write uint8 tensors to Protobuf as byte strings and read them back as uint8 tensors in order to avoid the Protobuf uint32_t varint encoding size penalty. <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() val = np.array([1, 2, 3], dtype=np.uint8) op = core.CreateOperator( "GivenTensorByteStringToUInt8Fill", [], ["out"], values=[val.tobytes()], shape=val.shape, ) workspace.RunOperatorOnce(op) print("Out:\n", workspace.FetchBlob("out")) ``` **Result** ``` Out: [1 2 3] ``` </details> )DOC").Arg("values"
 
The value for the elements of the output true Arg ("shape","The shape of the output tensor.""Cannot set the shape argument and pass in an input at the same time.").Arg("extra_shape"
 
The value for the elements of the output true The additional dimensions appended at the end of the shape indicated by the input blob Cannot set the extra_shape argument when there is no input blob Arg ("input_as_shape","1D tensor containing the desired output shape. First input must be in CPU context.").TensorInferenceFunction(FillerTensorInference< TensorProto_DataType_STRING >)
 
 REGISTER_CPU_OPERATOR (GivenTensorFill, GivenTensorFillOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (GivenTensorDoubleFill, GivenTensorFillOp< double, CPUContext >)
 
 REGISTER_CPU_OPERATOR (GivenTensorBoolFill, GivenTensorFillOp< bool, CPUContext >)
 
 REGISTER_CPU_OPERATOR (GivenTensorIntFill, GivenTensorFillOp< int, CPUContext >)
 
 REGISTER_CPU_OPERATOR (GivenTensorInt64Fill, GivenTensorFillOp< int64_t, CPUContext >)
 
 REGISTER_CPU_OPERATOR (GivenTensorStringFill, GivenTensorFillOp< std::string, CPUContext >)
 
 NO_GRADIENT (GivenTensorFill)
 
 NO_GRADIENT (GivenTensorDoubleFill)
 
 NO_GRADIENT (GivenTensorBoolFill)
 
 NO_GRADIENT (GivenTensorIntFill)
 
 NO_GRADIENT (GivenTensorInt64Fill)
 
 NO_GRADIENT (GivenTensorStringFill)
 
 SetDoc (R"DOC( This op fills an output tensor with the data specified by the *value* and *dtype* arguments. The output tensor shape is specified by the *shape* argument. Beware, when using this argument *value* should have a value for every element of the *output*, as missing values will not be initialized automatically. If *input_as_shape* is set to *true*, then the *input* should be a 1D tensor containing the desired output shape (the dimensions specified in *extra_shape* will also be appended). In this case, the *shape* argument should **not** be set. *Note: Do not set the shape argument and pass in an input at the same time.* Github Links: - https://github.com/caffe2/caffe2/blob/master/caffe2/operators/given_tensor_fill_op.h - https://github.com/caffe2/caffe2/blob/master/caffe2/operators/given_tensor_fill_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "GivenTensorFill", [], ["out"], values=[1., 2., 3.], shape=[3], ) workspace.RunOperatorOnce(op) print("Out:\n", workspace.FetchBlob("out")) ``` **Result** ``` Out: [1. 2. 3.] ``` </details> )DOC").Arg("values"
 
 REGISTER_CPU_OPERATOR (GroupNorm, GroupNormOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (GroupNormGradient, GroupNormGradientOp< float, CPUContext >)
 
 SetDoc (R"DOC( Group Normalization (GN) operation: https://arxiv.org/abs/1803.08494 )DOC").Arg("num_groups"
 
number of groups used by GN Arg ("epsilon","(float) default 1e-5; small constant added to var.").Input(0
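
A minimal GroupNorm sketch, assuming the Python frontend; gamma and beta carry one entry per channel and num_groups must divide the channel count:

```
import numpy as np
from caffe2.python import core, workspace

workspace.ResetWorkspace()
N, C, H, W = 2, 4, 3, 3
workspace.FeedBlob("X", np.random.randn(N, C, H, W).astype(np.float32))
workspace.FeedBlob("gamma", np.ones(C, dtype=np.float32))
workspace.FeedBlob("beta", np.zeros(C, dtype=np.float32))

op = core.CreateOperator(
    "GroupNorm", ["X", "gamma", "beta"], ["Y"],
    num_groups=2, epsilon=1e-5,
)
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("Y").shape)   # (2, 4, 3, 3), same shape as X
```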
 
 REGISTER_CPU_OPERATOR (GRUUnit, GRUUnitOp< float, CPUContext >)
 
in a sequence length aware fashion given the (fused) inputs X(TxNxD)
 
in a sequence length aware fashion given the previous hidden state (NxD)
 
in a sequence length aware fashion given the previous hidden and the sequence lengths (N)
 
in a sequence length aware fashion given the previous hidden and the sequence computes the GRU avoiding computation if the input is invalid (as in, the value at X[t][n] >=seqLengths[n].) DOC") .Arg( "drop_states"
 
in a sequence length aware fashion given the previous hidden and the sequence computes the GRU avoiding computation if the input is Bool to determine if hidden state is zeroes or passed along for timesteps past the given sequence_length Arg ("sequence_lengths","When false, the sequence lengths input is left out, ""and all following inputs are shifted left by one.").Output(0
 
 REGISTER_CPU_OPERATOR (GRUUnitGradient, GRUUnitGradientOp< float, CPUContext >)
 
 NumInputs (5, 6).NumOutputs(2).Arg("sequence_lengths"
 
 REGISTER_GRADIENT (GRUUnit, GetGRUUnitGradient)
 
 REGISTER_CPU_OPERATOR (FloatToHalf, FloatToHalfOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (HalfToFloat, HalfToFloatOp< CPUContext >)
 
out push_back (X)
 
 REGISTER_CPU_OPERATOR (Float16ConstantFill, Float16ConstantFillOp)
 
 REGISTER_CPU_OPERATOR (Float16UniformFill, Float16UniformFillOp)
 
max Arg ("shape","Shape of the tensor").Arg("min"
 
max Minimum value to generate Arg ("max","Maximum value to generate")
 
 NO_GRADIENT (Float16UniformFill)
 
The value for the elements of the output tensor Arg ("shape","The shape of the output tensor.").Output(0
 
 REGISTER_GRADIENT (FloatToHalf, GetFloatToHalfGradient)
 
 REGISTER_GRADIENT (HalfToFloat, GetHalfToFloatGradient)
 
 NO_GRADIENT (Float16ConstantFill)
 
std::vector< TensorShape > Float16FillerTensorInference (const OperatorDef &def, const vector< TensorShape > &in)
 
 REGISTER_CPU_OPERATOR (HardSigmoid, UnaryElementwiseWithArgsOp< TensorTypes< float >, CPUContext, HardSigmoidFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (HardSigmoidGradient, BinaryElementwiseWithArgsOp< TensorTypes< float >, CPUContext, HardSigmoidGradientFunctor< CPUContext >>)
 
 CostInferenceFunction (CostInferenceForHardSigmoid).IdenticalTypeAndShape().SetDoc(R"DOC( Applies hard sigmoid operation to the input data element-wise. The HardSigmoid operation takes one input $X$
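
A minimal sketch, assuming the usual alpha/beta arguments of HardSigmoid (y = clip(alpha * x + beta, 0, 1)):

```
import numpy as np
from caffe2.python import core, workspace

workspace.ResetWorkspace()
X = np.linspace(-5, 5, 11).astype(np.float32)
workspace.FeedBlob("X", X)

op = core.CreateOperator("HardSigmoid", ["X"], ["Y"], alpha=0.2, beta=0.5)
workspace.RunOperatorOnce(op)

Y = workspace.FetchBlob("Y")
# Compare against the clipped linear form assumed above.
np.testing.assert_allclose(Y, np.clip(0.2 * X + 0.5, 0.0, 1.0), rtol=1e-6)
```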
 
 REGISTER_CPU_OPERATOR (If, IfOp< CPUContext >)
 
INT_MAX SetDoc (R"DOC( 'If' control operator, first input is a scalar boolean blob that stores condition value. Accepts 'then_net' (required) and 'else_net' (optional) arguments for 'then' and 'else' subnets respectively. Subnets are executed in the same workspace as 'If'. )DOC").Arg("then_net"
 
INT_MAX Net executed when condition is true Arg ("else_net","Net executed when condition is false (optional)").Input(0
 
INT_MAX Net executed when condition is true Scalar boolean condition AllowInplace ([](int in, int out) -> bool{return true;})
 
 REGISTER_CUDA_OPERATOR (If, IfOp< CUDAContext >)
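
A minimal sketch of wiring up the If op from Python, assuming a NetDef can be passed directly as the then_net/else_net argument; because the subnets run in the same workspace, the branch's output blob is visible after the op runs:

```
import numpy as np
from caffe2.python import core, workspace

workspace.ResetWorkspace()

then_net = core.Net("then_net")
then_net.ConstantFill([], ["y"], shape=[1], value=1.0)
else_net = core.Net("else_net")
else_net.ConstantFill([], ["y"], shape=[1], value=-1.0)

workspace.FeedBlob("cond", np.array(True))   # scalar boolean condition blob
op = core.CreateOperator(
    "If", ["cond"], [],
    then_net=then_net.Proto(), else_net=else_net.Proto(),
)
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("y"))   # [1.] because cond is True
```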
 
 REGISTER_CPU_OPERATOR (Im2Col, Im2ColOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (Col2Im, Col2ImOp< float, CPUContext >)
 
 REGISTER_GRADIENT (Im2Col, GetIm2ColGradient)
 
 REGISTER_GRADIENT (Col2Im, GetCol2ImGradient)
 
 switch (order)
 
 CAFFE_ENFORCE (H >=dkernel_h)
 
 CAFFE_ENFORCE (W >=dkernel_w)
 
 Input (0,"X","4-tensor in NCHW or NHWC.").Output(0
 
 OPERATOR_SCHEMA (Col2Im).NumInputs(2).NumOutputs(1)
 
 REGISTER_CUDA_OPERATOR (Im2Col, Im2ColOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (Col2Im, Col2ImOp< float, CUDAContext >)
 
 REGISTER_CPU_OPERATOR (IntIndexCreate, IndexCreateOp< int32_t >)
 
 REGISTER_CPU_OPERATOR (LongIndexCreate, IndexCreateOp< int64_t >)
 
 REGISTER_CPU_OPERATOR (StringIndexCreate, IndexCreateOp< std::string >)
 
 REGISTER_CPU_OPERATOR (IndexGet, IndexGetOp)
 
 REGISTER_CPU_OPERATOR (IndexLoad, IndexLoadOp)
 
 REGISTER_CPU_OPERATOR (IndexStore, IndexStoreOp)
 
 REGISTER_CPU_OPERATOR (IndexFreeze, IndexFreezeOp)
 
 REGISTER_CPU_OPERATOR (IndexSize, IndexSizeOp)
 
Max number of elements, including the zero entry Output (0,"handler","Pointer to an Index instance.").ScalarType(TensorProto_DataType_UNDEFINED)
 
Max number of elements, including the zero entry Output (0,"handle","Pointer to an Index instance.").ScalarType(TensorProto_DataType_UNDEFINED)
 
return an Int tensor of same shape containing the indices for each of the keys If the index is frozen, unknown entries are given index 0; otherwise, new entries are added into the index If an insert is necessary but max_elements has been reached, fail DOC Input (0,"handle","Pointer to an Index instance.").Input(1
 
return an Int tensor of same shape containing the indices for each of the keys If the index is unknown entries are given index new entries are added into the index If an insert is necessary but max_elements has been fail DOC Tensor of keys to be looked up Output(0,"indices","Indices for each of the keys.").ScalarType(TensorProto disallowing creation of new index entries Should not be called concurrently with IndexGet DOC The input handle EnforceInplace ({{0, 0}}).ScalarType(TensorProto_DataType_UNDEFINED)
 
Pointer to an Index instance Input (1,"items","1-D tensor with elements starting with index 1.").Output(0
 
Pointer to an Index instance The input handle If set, skips the first entry of the tensor This allows loading tensors that are aligned with an embedding, where the first entry corresponds to the default index entry ScalarType (TensorProto_DataType_UNDEFINED)
 
Pointer to an Index instance Output (0,"items","Scalar int64 tensor with number of entries.")
 
 NO_GRADIENT (IndexGetOp)
 
 NO_GRADIENT (IntIndexCreate)
 
 NO_GRADIENT (LongIndexCreate)
 
 NO_GRADIENT (StringIndexCreate)
 
 SHOULD_NOT_DO_GRADIENT (IndexFreeze)
 
 SHOULD_NOT_DO_GRADIENT (IndexLoad)
 
 SHOULD_NOT_DO_GRADIENT (IndexStore)
 
 SHOULD_NOT_DO_GRADIENT (IndexSize)
 
 CAFFE_KNOWN_TYPE (std::unique_ptr< caffe2::IndexBase >)
 
 REGISTER_BLOB_SERIALIZER ((TypeMeta::Id< std::unique_ptr< caffe2::IndexBase >>()), IndexSerializer)
 
 REGISTER_BLOB_DESERIALIZER (std::unique_ptr< caffe2::IndexBase >, IndexDeserializer)
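
A minimal sketch of the index ops, assuming the Python frontend: create a mutable int64 index, look up keys with IndexGet (new keys receive fresh ids), and query its size:

```
import numpy as np
from caffe2.python import core, workspace

workspace.ResetWorkspace()
workspace.RunOperatorOnce(core.CreateOperator(
    "LongIndexCreate", [], ["handle"], max_elements=10))

workspace.FeedBlob("keys", np.array([100, 200, 100, 300], dtype=np.int64))
workspace.RunOperatorOnce(core.CreateOperator(
    "IndexGet", ["handle", "keys"], ["indices"]))
print(workspace.FetchBlob("indices"))   # e.g. [1 2 1 3]: repeated keys map to the same id

workspace.RunOperatorOnce(core.CreateOperator("IndexSize", ["handle"], ["size"]))
print(workspace.FetchBlob("size"))      # number of entries tracked by the index
```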
 
 REGISTER_CPU_OPERATOR (InstanceNormGradient, InstanceNormGradientOp< float, CPUContext >)
 
 OPERATOR_SCHEMA (InstanceNormGradient).NumInputs(4
 
 NumOutputs (3)
 
 REGISTER_GRADIENT (InstanceNorm, GetInstanceNormGradient)
 
 REGISTER_CPU_OPERATOR (InstanceNorm, InstanceNormOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (IntegralImage, IntegralImageOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (IntegralImageGradient, IntegralImageGradientOp< float, CPUContext >)
 
which contains the sum of pixel values within an image vertically and horizontally This integral image can then be used with other detection and tracking techniques DOC Input (0,"X","Images tensor of the form (N, C, H, W)").Output(0
 
which contains the sum of pixel values within an image vertically and horizontally This integral image can then be used with other detection and tracking techniques DOC Integrated image of the form (N, C, H+1, W+1)")
 
 OPERATOR_SCHEMA (IntegralImageGradient).NumInputs(2).NumOutputs(1)
 
 REGISTER_GRADIENT (IntegralImage, GetIntegralImageGradient)
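
A minimal sketch, assuming the usual integral-image convention of a zero-padded first row and column, so the bottom-right entry equals the full-image sum:

```
import numpy as np
from caffe2.python import core, workspace

workspace.ResetWorkspace()
X = np.random.rand(1, 1, 4, 5).astype(np.float32)
workspace.FeedBlob("X", X)
workspace.RunOperatorOnce(core.CreateOperator("IntegralImage", ["X"], ["Y"]))

Y = workspace.FetchBlob("Y")
print(Y.shape)                                # (1, 1, 5, 6): (N, C, H+1, W+1)
print(np.isclose(Y[0, 0, -1, -1], X.sum()))   # True under the assumed convention
```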
 
 REGISTER_CPU_OPERATOR (IsEmpty, IsEmptyOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (BernoulliJSD, BernoulliJSDOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (BernoulliJSDGradient, BernoulliJSDGradientOp< float, CPUContext >)
 
array of probabilities for prediction Input (0,"T","array of probabilities for target").Output(0
 
 OPERATOR_SCHEMA (BernoulliJSDGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_GRADIENT (BernoulliJSD, GetBernoulliJSDGradient)
 
 REGISTER_CPU_OPERATOR (KeySplit, KeySplitOp< int64_t, CPUContext >)
 
 NO_GRADIENT (KeySplitOp)
 
 OPERATOR_SCHEMA (KeySplit).NumInputs(1).NumOutputs(1
 
 REGISTER_CPU_OPERATOR (LayerNorm, LayerNormOp< CPUContext >)
 
 OPERATOR_SCHEMA (LayerNormGradient).NumInputs(5).NumOutputs(1)
 
 REGISTER_CPU_OPERATOR (LayerNormGradient, LayerNormGradientOp< CPUContext >)
 
 REGISTER_GRADIENT (LayerNorm, GetLayerNormGradient)
 
std::vector< int > input_dims (input_dims_long.begin(), input_dims_long.end())
 
ArgumentHelper helper (def)
 
std::vector< int > stat_dims (input_dims.begin(), input_dims.begin()+canonical_axis)
 
stat_dims push_back (1)
 
 SetDoc (R"DOC( Computes layer normalization as described in https://arxiv.org/pdf/1607.06450.pdf. Given an input vector x \in [a_0, a_1, ...,a_{k-1}, a_k, ..., a_{n-1}], this op treats dimensions a_k through a_{n-1} as feature vectors. For each feature vector, the op contains the mean and standard deviation. Then, it returns the normalized values (with respect to the feature vector). Note that this op does not contain the scale an bias terms described in the paper. Simply follow this op with an FC op to add those. Concretely, this op implements: h = \frac{1}{\sigma}(a - \mu) where \mu = \frac{1}{H}\sum_{i=1}^{H} a_i and \sigma = \sqrt{\frac{1}{H}\sum_{i=1}^{H}(a_i - \mu)^2} where H is the number of hidden units (i.e. product of dimensions from 'axis' to the end.) )DOC").Arg("axis"
 
Describes axis of the inputs Defaults to one because the axis most likely describes the batch size Arg ("epsilon","(float) default to 0.001. Small value to be added to the stdev when"" dividing out by that value. This prevents division by zero.").Input(0
 
Describes axis of the inputs Defaults to one because the axis most likely describes the batch size Input tensor which layer normalization will be applied to Output (0,"output","Normalized values").Output(1
 
Describes axis of the inputs Defaults to one because the axis most likely describes the batch size Input tensor which layer normalization will be applied to Mean values for each feature vector Output (2,"stddev","Standard deviations for each feature vector")
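
A minimal LayerNorm sketch, assuming the Python frontend; it checks the per-row mean against the formula above and prints the reduced stat shapes:

```
import numpy as np
from caffe2.python import core, workspace

workspace.ResetWorkspace()
X = np.random.randn(4, 5).astype(np.float32)
workspace.FeedBlob("X", X)

op = core.CreateOperator("LayerNorm", ["X"], ["Y", "mean", "std"], axis=1, epsilon=1e-5)
workspace.RunOperatorOnce(op)

Y, mean, std = (workspace.FetchBlob(b) for b in ["Y", "mean", "std"])
print(Y.shape, mean.shape, std.shape)   # (4, 5) (4, 1) (4, 1)

mu = X.mean(axis=1, keepdims=True)      # mean over the feature dimensions (axis >= 1)
np.testing.assert_allclose(mean, mu, rtol=1e-4, atol=1e-5)
```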
 
 REGISTER_CPU_OPERATOR (LeakyRelu, LeakyReluOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (LeakyReluGradient, LeakyReluGradientOp< float, CPUContext >)
 
default CostInferenceFunction (PointwiseCostInference< 2 >).IdenticalTypeAndShape().SetDoc(R"DOC( The *LeakyRelu* op takes one input tensor $X$ and an argument $alpha$
 
 Arg ("alpha","Coefficient of leakage").InheritOnnxSchema()
 
 REGISTER_GRADIENT (LeakyRelu, GetLeakyReluGradient)
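
A minimal sketch checking LeakyRelu against its elementwise definition (x for x > 0, alpha * x otherwise):

```
import numpy as np
from caffe2.python import core, workspace

workspace.ResetWorkspace()
X = np.array([-2.0, -0.5, 0.0, 1.0, 3.0], dtype=np.float32)
workspace.FeedBlob("X", X)

op = core.CreateOperator("LeakyRelu", ["X"], ["Y"], alpha=0.1)
workspace.RunOperatorOnce(op)

Y = workspace.FetchBlob("Y")
np.testing.assert_allclose(Y, np.where(X > 0, X, 0.1 * X), rtol=1e-6)
```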
 
 REGISTER_CPU_OPERATOR (LengthsSplit, LengthsSplitOp< CPUContext >)
 
NumInputs(1, 2).NumOutputs(1).ScalarType(TensorProto GRADIENT_NOT_IMPLEMENTED_YET (LengthsSplit)
 
 REGISTER_CPU_OPERATOR (LengthsPad, LengthsPadOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (SparseLengthsSumFused8BitRowwise, SparseLengthsFused8BitRowwiseOp< CPUContext >)
 
NumInputs(3).NumOutputs(1).ValueKeyLengthInputFillers(SparseLengthsFused8BitRowwiseOp< CPUContextNO_GRADIENT (SparseLengthsSumFused8BitRowwise)
 
 REGISTER_CPU_OPERATOR (SparseLengthsWeightedSumFused8BitRowwise, SparseLengthsFused8BitRowwiseOp< CPUContext, true >)
 
true SparseLengthsFused8BitRowwiseOp< CPUContext, true >::WEIGHTS SetDoc (R"DOC( Performs the same operation as SparseLengthsWeightedSum, but operating on 8-bit rowwise quantized matrices with fused storage (where each row stores quantized values, and then 4-byte scale and 4-byte bias). )DOC").Input(0
 
true SparseLengthsFused8BitRowwiseOp< CPUContext, true >::WEIGHTS uint8 tensor obtained with operator FloatToFused8BitRowwiseQuantized") .Input (1,"INDICES","Integer vector containing indices of the first ""dimension of DATA for the slices that are being aggregated").Input(2
 
true SparseLengthsFused8BitRowwiseOp< CPUContext, true >::WEIGHTS uint8 tensor obtained with Vector with the same sum of elements as the first dimension of DATA Input (3,"WEIGHTS","Vector of weights to scale rows of DATA with before reduction").Output(0
 
 NO_GRADIENT (SparseLengthsWeightedSumFused8BitRowwise)
 
 REGISTER_CPU_OPERATOR (SparseLengthsMeanFused8BitRowwise, SparseLengthsFused8BitRowwiseOp< CPUContext, false, true >)
 
true SparseLengthsFused8BitRowwiseOp< CPUContext, false, true >::LENGTHS SetDoc (R"DOC( Performs the same operation as SparseLengthsMean, but operating on 8-bit rowwise quantized matrices with fused storage (where each row stores quantized values, and then 4-byte scale and 4-byte bias). )DOC").Input(0
 
true SparseLengthsFused8BitRowwiseOp< CPUContext, false, true >::LENGTHS uint8 tensor obtained with Vector with the same sum of elements as the first dimension of DATA Output (0,"output","output")
 
 NO_GRADIENT (SparseLengthsMeanFused8BitRowwise)
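
A minimal end-to-end sketch, assuming the Python frontend: quantize a small table, then let the fused op gather, de-quantize, and sum the rows of each segment defined by LENGTHS:

```
import numpy as np
from caffe2.python import core, workspace

workspace.ResetWorkspace()
workspace.FeedBlob("table", np.random.rand(4, 2).astype(np.float32))
workspace.RunOperatorOnce(core.CreateOperator(
    "FloatToFused8BitRowwiseQuantized", ["table"], ["table_q"]))

workspace.FeedBlob("indices", np.array([0, 1, 2, 3], dtype=np.int64))
workspace.FeedBlob("lengths", np.array([3, 1], dtype=np.int32))   # segments: rows 0-2, row 3

workspace.RunOperatorOnce(core.CreateOperator(
    "SparseLengthsSumFused8BitRowwise", ["table_q", "indices", "lengths"], ["out"]))
print(workspace.FetchBlob("out").shape)   # (2, 2): one de-quantized sum per segment
```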
 
 REGISTER_CPU_OPERATOR (SparseLengthsSum, SparseLengthsSumOp)
 
 REGISTER_CPU_OPERATOR (SparseLengthsWeightedSum, SparseLengthsWeightedSumOp)
 
 REGISTER_CPU_OPERATOR (SparseLengthsMean, SparseLengthsMeanOp)
 
for each weights are accessed by where L is the length of given row This is basically a fused operator of LengthsRangeFill+Gather+SparseWeightedSum) DOC") .Input (0,"DATA","uint8 tensor obtained with ""operator FloatToRowwiseQuantized8Bits").Input(1
 
for each weights are accessed by where L is the length of given row This is basically a fused Scalar multipliers for the input slices Must be a vector with the length matching the length of DATA Input (2,"INDICES","Integer vector containing indices of the first ""dimension of DATA for the slices that are being aggregated").Input(3
 
 REGISTER_CPU_OPERATOR_STR ("SparseLengthsPositionalWeightedSum", CPUSparseLengthsReductionOp< float, TensorTypes< float, at::Half >, 1, 0, 1 >)
 
template<typename Def >
string FormatDoc ()
 
NumInputs(SparseLengthsSumDef::ForwardOp::kNumInputs).NumOutputs(1).ValueKeyLengthInputFillers(SparseLengthsSumOp REGISTER_CPU_OPERATOR (SparseLengthsSumGradient, SparseLengthsSumDef::BackwardOp)
 
 NumInputs (SparseLengthsSumDef::BackwardOp::kNumInputs).NumOutputs(1).DisallowInputFillers()
 
NumInputs(SparseLengthsWeightedSumDef::ForwardOp::kNumInputs).NumOutputs(1).WeightedValueKeyLengthInputFillers(SparseLengthsWeightedSumOp REGISTER_CPU_OPERATOR (SparseLengthsWeightedSumGradient, SparseLengthsWeightedSumDef::BackwardOp)
 
 NumInputs (SparseLengthsWeightedSumDef::BackwardOp::kNumInputs).NumOutputs(1).DisallowInputFillers()
 
 REGISTER_GRADIENT (SparseLengthsWeightedSum, SparseLengthsWeightedSumDef::GetGradient) using SparseLengthsMeanDef
 
NumInputs(SparseLengthsMeanDef::ForwardOp::kNumInputs).NumOutputs(1).ValueKeyLengthInputFillers(SparseLengthsMeanOp REGISTER_CPU_OPERATOR (SparseLengthsMeanGradient, SparseLengthsMeanDef::BackwardOp)
 
 NumInputs (SparseLengthsMeanDef::BackwardOp::kNumInputs).NumOutputs(1).DisallowInputFillers()
 
 REGISTER_CPU_OPERATOR (Rowwise8BitQuantizedToFloat, Rowwise8BitQuantizedToFloatOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (FloatToRowwiseQuantized8Bits, FloatToRowwiseQuantized8BitsOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (SparseLengthsSum8BitsRowwise, SparseLengths8BitsRowwiseOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (SparseLengthsWeightedSum8BitsRowwise, SparseLengths8BitsRowwiseOp< CPUContext, 1 >)
 
 REGISTER_CPU_OPERATOR (SparseLengthsMean8BitsRowwise, SparseLengths8BitsRowwiseOp< CPUContext, 0, 1 >)
 
 REGISTER_CPU_OPERATOR (SparseLengthsWeightedMean8BitsRowwise, SparseLengths8BitsRowwiseOp< CPUContext, 1, 1 >)
 
NumInputs(4).NumOutputs(1).ValueLengthInputFillers(SparseLengths8BitsRowwiseOp< CPUContextNumInputs (5).NumOutputs(1).ValueLengthInputFillers(SparseLengths8BitsRowwiseOp< CPUContext
 
NumInputs(4).NumOutputs(1).ValueLengthInputFillers(SparseLengths8BitsRowwiseOp< CPUContext > SparseLengths8BitsRowwiseOp< CPUContext, 1 >::LENGTHS SetDoc (R"DOC( Variation of SparseLengthsWeightedSum operator, where DATA is stored using 8bits. DATA was quantized with 8Bit row-wise quantization (see doc to FloatToRowwiseQuantized8Bits operator). To restore DATA from 8Bit, we use additional input that stores scales and biases. )DOC").Input(0
 
NumInputs(4).NumOutputs(1).ValueLengthInputFillers(SparseLengths8BitsRowwiseOp< CPUContext > SparseLengths8BitsRowwiseOp< CPUContext, 1 >::LENGTHS uint8 tensor obtained with operator FloatToRowwiseQuantized8Bits") .Input (1,"SCALARS","Scalar multipliers for the input slices. Must ""be a vector with the length matching the length of INDICES").Input(2
 
NumInputs(4).NumOutputs(1).ValueLengthInputFillers(SparseLengths8BitsRowwiseOp< CPUContext > SparseLengths8BitsRowwiseOp< CPUContext, 1 >::LENGTHS uint8 tensor obtained with Integer vector containing indices of the first dimension of DATA for the slices that are being aggregated Input (3,"LENGTHS","Vector with the same sum of elements as the first dimension of DATA").Input(4
 
SparseLengths8BitsRowwiseOp< CPUContext, 1, 1 >::LENGTHS SetDoc (R"DOC( Variation of SparseLengthsWeightedMean operator, where DATA is stored using 8bits. DATA was quantized with 8Bit row-wise quantization (see doc to FloatToRowwiseQuantized8Bits operator). To restore DATA from 8Bit, we use additional input that stores scales and biases. )DOC").Input(0
 
NumInputs(1).NumOutputs(2).ValueLengthInputFillers(SparseLengths8BitsRowwiseOp< CPUContext > NumInputs(2).NumOutputs(1).ValueLengthInputFillers(SparseLengths8BitsRowwiseOp< CPUContextNO_GRADIENT (Rowwise8BitQuantizedToFloat)
 
 NO_GRADIENT (FloatToRowwiseQuantized8Bits)
 
 NO_GRADIENT (SparseLengthsSum8BitsRowwise)
 
 NO_GRADIENT (SparseLengthsWeightedSum8BitsRowwise)
 
 NO_GRADIENT (SparseLengthsMean8BitsRowwise)
 
 NO_GRADIENT (SparseLengthsWeightedMean8BitsRowwise)
 
 REGISTER_CPU_OPERATOR (LengthsTile, LengthsTileOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (LengthsTopK, LengthsTopKOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (LengthsTopKGradient, LengthsTopKGradientOp< float, CPUContext >)
 
where segments are defined by their and concatenate them in an output tensor of the output value will be padded and the corresponding output indices will be padded by DOC Input (0,"DATA","Tensor of rank 1. First dimension must be equal to the sum of ""lengths").Input(1
 
where segments are defined by their and concatenate them in an output tensor of the output value will be padded and the corresponding output indices will be padded by DOC Tensor of int32 lengths of rank Output (0,"TopKValue","Output top k elements for each segment, with""shape=(SIZE(lengths), k)").Output(1
 
where segments are defined by their and concatenate them in an output tensor of the output value will be padded and the corresponding output indices will be padded by DOC Tensor of int32 lengths of rank Output indices in DATA corresponding to value in TopKValue Arg ("k","the number of top values to return for each segment, if the number ""of values is smaller than k, the values would be padded with 0 and ""indices would be padded with -1.")
 
 OPERATOR_SCHEMA (LengthsTopKGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_GRADIENT (LengthsTopK, GetLengthsTopKGradient)
 
 REGISTER_CPU_OPERATOR (DBExists, DBExistsOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (Load, LoadOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (Save, SaveOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (Checkpoint, CheckpointOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (Snapshot, CheckpointOp< CPUContext >)
 
NumInputs(0).NumOutputs(1).SetDoc(R"DOC( Checks if the db described by the arguments exists. Github Links see the absolute_path arg details for options regarding the current root folder of the workspace Arg ("db_type","*(type: string)* Type of db to save (options: \"lmdb\", ""\"leveldb\", \"minidb\").")
 
 NumInputs (0, INT_MAX).NumOutputs(0
 
INT_MAX SetDoc (R"DOC( The Load operator loads a set of serialized blobs from a db or multiple dbs. It takes $[0, \infty)$ number of inputs and $[0, \infty)$ number of outputs, using the db keys to match the db entries with the outputs. If at least one input is passed, then it is assumed that that input blobs are a set of DBReaders to load from. Otherwise the `db` or `dbs` argument is used to load blobs from one single db or multiple dbs respectively. `db_type` argument is used to specify the type of the input db/dbs. Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/load_save_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "Load", [], ["X", "Y"], db="test_db", db_type="lmdb" ) workspace.RunOperatorOnce(op) print("X:", workspace.FetchBlob("X")) print("Y:", workspace.FetchBlob("Y")) ``` </details> )DOC").Input(0
 
default save the db directly to the path specified by the db arg If not set (default)
 
default save the db directly to the path specified by the db arg If not prepend the path of the current root folder of the workspace to the path specified by the db arg Arg ("add_prefix","*(type: string, default: \"\")* Blobs will be prefixed with this when ""loading. Useful for avoiding collisions with blobs existing in the ""workspace. The output blob names specified to this op should include ""this prefix.").Arg("strip_prefix"
 
default save the db directly to the path specified by the db arg If not prepend the path of the current root folder of the workspace to the path specified by the db arg characters that precede strip_prefix will be removed Useful for removing device scope from blob names Arg ("db","*(type: string)* The output path of the db. See the ""`absolute_path` arg details for options regarding the current root folder ""of the workspace.").Arg("dbs"
 
default save the db directly to the path specified by the db arg If not prepend the path of the current root folder of the workspace to the path specified by the db arg characters that precede strip_prefix will be removed Useful for removing device scope from blob names minidb Arg ("keep_device","*(type: int; default: 0)* If nonzero, the blobs are loaded into the ""device that is specified in the serialized `BlobProto`. Otherwise, ""the device will be set as the one that the `Load` operator is being ""run under.").Arg("load_all"
 
default will load all blobs pointed to by the db to the workspace overwriting creating blobs as needed Arg ("allow_incomplete","*(type: bool; default: False)* If True, will allow not loading all ""the output blobs specified in the outputs.").Arg("source_blob_names"
 
default save the db directly to the path specified by the db arg If not prepend the path of the current root folder of the workspace to the path specified by the db arg Arg ("strip_prefix","*(type: string, default: \"\")* Characters in the provided blob names ""that match `strip_prefix` will be removed prior to saving. Also, ""characters that precede `strip_prefix` will be removed. Useful for ""removing device scope from blob names.").Arg("blob_name_overrides"
 
default save the db directly to the path specified by the db arg If not prepend the path of the current root folder of the workspace to the path specified by the db arg * List (string))*If set
 
default save the db directly to the path specified by the db arg If not prepend the path of the current root folder of the workspace to the path specified by the db arg used as blob names instead of original blob names Must be same length as number of blobs minidb Arg ("chunk_size","*(type: string; default: kDefaultChunkSize)* The chunk ""size to split tensor data into. If not set, caffe2_tensor_chunk_size will ""be used").Input(0
 
template<typename... Ts>
string FormatString (const string &pattern, Ts...values)
 
 REGISTER_CUDA_OPERATOR (Load, LoadOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (Save, SaveOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (Checkpoint, CheckpointOp< CUDAContext >)
 
 REGISTER_CPU_OPERATOR (LRN, LRNOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (LRNGradient, LRNGradientOp< float, CPUContext >)
 
 SetDoc (R"DOC( `LRN` applies Local Response Normalization to an input blob. This operation performs a kind of "lateral inhibition" by normalizing over local input regions, where normalization is applied across channels. This operator is typically used to normalize an unbounded activation (such as ReLU). The output shape is the same as the input shape. The `brew` module has a wrapper for this operator for use in a `ModelHelper` object. The formula for LRN is as follows: $$b_{c} = a_{c}(bias + \frac{\alpha}{n}\sum_{c'=max(0,c-n/2)}^{min(N-1,c+n/2)} a_{c'}^2 )^{-\beta}$$ Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/local_response_normalization_op.h - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/local_response_normalization_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator("LRN", ["X"], ["Y", "Y_scale"], size=11, alpha=0.001, beta=0.5, bias=2.0, order="NHWC" ) workspace.FeedBlob("X", np.random.randn(1, 6, 6, 1).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) print("Y_scale:\n", workspace.FetchBlob("Y_scale")) ``` **Result** ``` X: [[[[ 0.72985137] [-0.3753357 ] [ 2.7344604 ] [-0.5937792 ] [ 0.38440478] [-2.1659644 ]] [[-0.92846817] [-0.9996144 ] [ 0.212943 ] [-1.968045 ] [-0.77839696] [ 0.45492038]] [[-0.11263168] [ 1.9901097 ] [ 0.19275683] [ 0.15630436] [ 0.7536298 ] [-0.77339894]] [[ 0.8353551 ] [-0.7784452 ] [ 1.779317 ] [ 0.22421335] [ 1.3846219 ] [-3.0546608 ]] [[ 0.09977621] [ 2.2071757 ] [ 0.79971045] [ 3.563886 ] [-0.7169287 ] [ 0.77170426]] [[-1.4296649 ] [ 0.19181213] [ 0.45961624] [-1.0201577 ] [ 0.62854475] [-0.6395456 ]]]] Y: [[[[ 0.5160766 ] [-0.26540157] [ 1.9332271 ] [-0.41986194] [ 0.27181432] [-1.5314047 ]] [[-0.6565133 ] [-0.7068181 ] [ 0.15057328] [-1.3914955 ] [-0.5504022 ] [ 0.32167578]] [[-0.0796426 ] [ 1.4070934 ] [ 0.13629955] [ 0.11052381] [ 0.53288984] [-0.5468682 ]] [[ 0.5906759 ] [-0.5504363 ] [ 1.2580767 ] [ 0.1585426 ] [ 0.9790328 ] [-2.1595135 ]] [[ 0.07055242] [ 1.5605361 ] [ 0.5654725 ] [ 2.5193207 ] [-0.50693923] [ 0.54567 ]] [[-1.0108787 ] [ 0.13563155] [ 0.3249962 ] [-0.72134334] [ 0.44444424] [-0.45222285]]]] Y_scale: [[[[2.0000484] [2.0000129] [2.0006797] [2.000032 ] [2.0000134] [2.0004265]] [[2.0000784] [2.0000908] [2.000004 ] [2.0003521] [2.000055 ] [2.0000188]] [[2.0000012] [2.00036 ] [2.0000033] [2.0000021] [2.0000517] [2.0000544]] [[2.0000634] [2.000055 ] [2.0002878] [2.0000045] [2.0001743] [2.0008483]] [[2.000001 ] [2.000443 ] [2.0000582] [2.0011547] [2.0000467] [2.0000541]] [[2.0001857] [2.0000033] [2.0000193] [2.0000947] [2.000036 ] [2.0000372]]]] ``` </details> )DOC").Arg("size"
 
 OPERATOR_SCHEMA (LRNGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_GRADIENT (LRN, GetLRNGradient)
 
 REGISTER_CPU_OPERATOR (LC, LocallyConnectedOp< float, CPUContext >)
 
NumInputs(2, 3).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContextREGISTER_CPU_OPERATOR (LC1D, LocallyConnectedOp< float, CPUContext >)
 
NumInputs(2, 3).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContextREGISTER_CPU_OPERATOR (LC2D, LocallyConnectedOp< float, CPUContext >)
 
NumInputs(2, 3).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContextREGISTER_CPU_OPERATOR (LC3D, LocallyConnectedOp< float, CPUContext >)
 
NumInputs(2, 3).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContextREGISTER_CPU_OPERATOR (LCGradient, LocallyConnectedGradientOp< float, CPUContext >)
 
 OPERATOR_SCHEMA (LCGradient).NumInputs(2
 
 REGISTER_CPU_OPERATOR (LC1DGradient, LocallyConnectedGradientOp< float, CPUContext >)
 
 OPERATOR_SCHEMA (LC1DGradient).NumInputs(2
 
 REGISTER_CPU_OPERATOR (LC2DGradient, LocallyConnectedGradientOp< float, CPUContext >)
 
 OPERATOR_SCHEMA (LC2DGradient).NumInputs(2
 
 REGISTER_CPU_OPERATOR (LC3DGradient, LocallyConnectedGradientOp< float, CPUContext >)
 
 OPERATOR_SCHEMA (LC3DGradient).NumInputs(2
 
 REGISTER_GRADIENT (LC, GetLocallyConnectedGradient)
 
 REGISTER_GRADIENT (LC1D, GetLocallyConnectedGradient)
 
 REGISTER_GRADIENT (LC2D, GetLocallyConnectedGradient)
 
 REGISTER_GRADIENT (LC3D, GetLocallyConnectedGradient)
 
 REGISTER_CUDA_OPERATOR (LC, LocallyConnectedOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (LCGradient, LocallyConnectedGradientOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (LC1D, LocallyConnectedOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (LC1DGradient, LocallyConnectedGradientOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (LC2D, LocallyConnectedOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (LC2DGradient, LocallyConnectedGradientOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (LC3D, LocallyConnectedOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (LC3DGradient, LocallyConnectedGradientOp< float, CUDAContext >)
 
 REGISTER_CPU_OPERATOR (Log, UnaryElementwiseOp< TensorTypes< float >, CPUContext, LogFunctor< CPUContext >>)
 
 REGISTER_GRADIENT (Log, GetLogGradient)
 
 REGISTER_CUDA_OPERATOR (Log, UnaryElementwiseOp< TensorTypes< float >, CUDAContext, LogFunctor< CUDAContext >>)
 
 REGISTER_CPU_OPERATOR (Logit, UnaryElementwiseWithArgsOp< TensorTypes< float >, CPUContext, LogitFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (LogitGradient, LogitGradientOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (AveragedLoss, AveragedLoss< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (AveragedLossGradient, AveragedLossGradient< float, CPUContext >)
 
NumInputs(1).NumOutputs(1).ScalarType(TensorProto OPERATOR_SCHEMA (AveragedLossGradient).NumInputs(2).NumOutputs(1)
 
 REGISTER_GRADIENT (AveragedLoss, GetAveragedLossGradient)
 
 REGISTER_CPU_OPERATOR (LpPool, PoolOp< float, CPUContext, LpPoolFunctor >)
 
 REGISTER_CPU_OPERATOR (LpPoolGradient, PoolGradientOp< float, CPUContext, LpPoolFunctor >)
 
 OPERATOR_SCHEMA (LpPoolGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_GRADIENT (LpPool, GetPoolGradient)
 
 REGISTER_CPU_OPERATOR (LSTMUnit, LSTMUnitOp< CPUContext >)
 
 NumInputs (4, 5).NumOutputs(2).SetDoc(R"DOC( LSTMUnit computes the activations of a standard LSTM (without peephole connections)
 
in a sequence length aware fashion given the previous cell and the sequence computes the LSTM avoiding computation if the input is invalid (as in, the value at X[t][n] >= seqLengths[n].) DOC") .Arg("forget_bias"
 
 REGISTER_CPU_OPERATOR (LSTMUnitGradient, LSTMUnitGradientOp< CPUContext >)
 
 NumInputs (8, 9).NumOutputs(3).Arg("sequence_lengths"
 
 REGISTER_GRADIENT (LSTMUnit, GetLSTMUnitGradient)
 
 CAFFE_KNOWN_TYPE (MapType64To64)
 
 REGISTER_CPU_OPERATOR (MarginRankingCriterion, MarginRankingCriterionOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (MarginRankingCriterionGradient, MarginRankingCriterionGradientOp< CPUContext >)
 
MarginRankingCriterion takes two input tensors X1 (Tensor) and X2 (Tensor), and a label Y (Tensor), and produces the loss (Tensor), where the loss function loss(X1, X2, Y) = max(0, -Y * (X1 - X2) + margin) is applied to the data elementwise
 
 REGISTER_CPU_OPERATOR (MatMul, MatMulOp< float, CPUContext >)
 
out[0] set_data_type (in[0].data_type())
 
 if (trans_a)
 
 if (trans_b)
 
out[0] add_dims (M)
 
out[0] add_dims (N)
 
 SetDoc (R"DOC( Matrix multiplication $Y = A * B$, where `A` has size (M x K), `B` has size (K x N), and `Y` will have a size (M x N). To transpose `A` or `B` before multiplication, pass 1 to the `trans_a` and/or `trans_b` arguments, which separate the first and second dimensions of the respective matrices using `axis_a` and `axis_b`. Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/matmul_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "MatMul", ["A", "B"], ["Y"], ) workspace.FeedBlob("A", np.random.randint(10, size=(3,3)).astype(np.float32)) workspace.FeedBlob("B", np.random.randint(10, size=(3,3)).astype(np.float32)) print("A:", workspace.FetchBlob("A")) print("B:", workspace.FetchBlob("B")) workspace.RunOperatorOnce(op) print("Y:", workspace.FetchBlob("Y")) ``` **Result** ``` A: [[1. 8. 3.] [6. 4. 4.] [5. 4. 7.]] B: [[4. 0. 3.] [3. 1. 1.] [8. 5. 8.]] Y: [[52. 23. 35.] [68. 24. 54.] [88. 39. 75.]] ``` </details> )DOC").Input(0
 
 REGISTER_GRADIENT (MatMul, GetMatMulGradient)
 
 REGISTER_CUDA_OPERATOR (MatMul, MatMulOp< float, CUDAContext >)
 
 REGISTER_CPU_OPERATOR (Mean, MeanOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (MeanGradient, MeanGradientOp< CPUContext >)
 
 SetDoc (R"DOC( Element-wise mean of an arbitrary number of input tensors. This operation can be performed in-place, by using the first input blob as the output blob. All inputs must have the same shape and data type, and the output will have the same shape as the inputs. Github Link: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/mean_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "Mean", ["X", "Y", "Z"], ["X"], ) workspace.FeedBlob("X", (np.random.rand(3,3)).astype(np.float32)) workspace.FeedBlob("Y", (np.random.rand(3,3)).astype(np.float32)) workspace.FeedBlob("Z", (np.random.rand(3,3)).astype(np.float32)) print("X:", workspace.FetchBlob("X")) print("Y:", workspace.FetchBlob("Y")) print("Z:", workspace.FetchBlob("Z")) workspace.RunOperatorOnce(op) print("Mean:", workspace.FetchBlob("X")) ``` **Result** ``` X: [[0.6035237 0.5305746 0.6298913 ] [0.9169737 0.01280353 0.16286302] [0.6017664 0.9946255 0.05128575]] Y: [[0.07544111 0.45371833 0.08460239] [0.9708728 0.7422064 0.7933344 ] [0.97671497 0.3411384 0.73818344]] Z: [[0.08837954 0.90187573 0.46734726] [0.6308827 0.8719029 0.39888734] [0.90059936 0.92883426 0.5695987 ]] Mean: [[0.25578147 0.6287229 0.39394698] [0.8395764 0.5423043 0.45169494] [0.8263602 0.75486606 0.45302266]] ``` </details> )DOC").Input(0
 
 REGISTER_GRADIENT (Mean, GetMeanGradient)
 
 REGISTER_CPU_OPERATOR (MaxGradient, MaxGradientOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (MinGradient, MinGradientOp< float, CPUContext >)
 
 OPERATOR_SCHEMA (MaxGradient).NumInputs(3
 
INT_MAX NumOutputs (1, INT_MAX)
 
 OPERATOR_SCHEMA (MinGradient).NumInputs(3
 
 REGISTER_GRADIENT (Max, GetMaxGradient)
 
 REGISTER_GRADIENT (Min, GetMinGradient)
 
 REGISTER_CPU_OPERATOR (Min, MinOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (Max, MaxOp< float, CPUContext >)
 
 SetDoc (R"DOC( Element-wise max of an arbitrary number of input tensors. This operation can be performed in-place, by using the first input blob as the output blob. All inputs must have the same shape and data type, and the output will have the same shape as the inputs. Github Link: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/minmax_ops.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "Max", ["X", "Y", "Z"], ["X"], ) workspace.FeedBlob("X", (np.random.rand(3,3)).astype(np.float32)) workspace.FeedBlob("Y", (np.random.rand(3,3)).astype(np.float32)) workspace.FeedBlob("Z", (np.random.rand(3,3)).astype(np.float32)) print("X:", workspace.FetchBlob("X")) print("Y:", workspace.FetchBlob("Y")) print("Z:", workspace.FetchBlob("Z")) workspace.RunOperatorOnce(op) print("Max:", workspace.FetchBlob("X")) ``` **Result** ``` X: [[0.4496477 0.07061381 0.7139333 ] [0.83203 0.05970785 0.72786295] [0.75988126 0.04601283 0.32820013]] Y: [[0.05683139 0.16872478 0.671098 ] [0.70739156 0.09878621 0.03416285] [0.34087983 0.94986707 0.67263436]] Z: [[0.48051122 0.07141234 0.85264146] [0.77086854 0.22082241 0.13154659] [0.42401117 0.995431 0.4263775 ]] Max: [[0.48051122 0.16872478 0.85264146] [0.83203 0.22082241 0.72786295] [0.75988126 0.995431 0.67263436]] ``` </details> )DOC").Input(0
 
 SetDoc (R"DOC( Element-wise min of an arbitrary number of input tensors. This operation can be performed in-place, by using the first input blob as the output blob. All inputs must have the same shape and data type, and the output will have the same shape as the inputs. Github Link: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/minmax_ops.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "Min", ["X", "Y", "Z"], ["X"], ) workspace.FeedBlob("X", (np.random.rand(2,2)).astype(np.float32)) workspace.FeedBlob("Y", (np.random.rand(2,2)).astype(np.float32)) workspace.FeedBlob("Z", (np.random.rand(2,2)).astype(np.float32)) print("X:", workspace.FetchBlob("X")) print("Y:", workspace.FetchBlob("Y")) print("Z:", workspace.FetchBlob("Z")) workspace.RunOperatorOnce(op) print("Min:", workspace.FetchBlob("X")) ``` **Result** ``` X: [[0.32731926 0.4939747 ] [0.29242373 0.43460014]] Y: [[0.40928316 0.916115 ] [0.77526504 0.29339448]] Z: [[0.7899794 0.90335774] [0.82599413 0.2843068 ]] Min: [[0.32731926 0.4939747 ] [0.29242373 0.2843068 ]] ``` </details> )DOC").Input(0
 
 REGISTER_CPU_OPERATOR (Moments, MomentsOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (MomentsGradient, MomentsGradientOp< float, CPUContext >)
 
then the resulting tensor has the reduced dimension pruned DOC Arg ("axes","A list of integers, along which to reduce. If axes is not provided, ""the op computes the element-wise mean and variance.").Arg("keepdims"
 
then the resulting tensor has the reduced dimension pruned DOC Keep the reduced dimension (s) or not
 
then the resulting tensor has the reduced dimension pruned DOC Keep the reduced default True keeps the reduced An input tensor Output (0,"mean","Reduced mean tensor.").Output(1
 
 OPERATOR_SCHEMA (MomentsGradient).NumInputs(4).NumOutputs(1)
 
 REGISTER_GRADIENT (Moments, GetMomentsGradient)
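
A minimal Moments sketch, assuming the Python frontend; the reduced mean and variance are compared against numpy over the same axes:

```
import numpy as np
from caffe2.python import core, workspace

workspace.ResetWorkspace()
X = np.random.rand(3, 4).astype(np.float32)
workspace.FeedBlob("X", X)

op = core.CreateOperator("Moments", ["X"], ["mean", "var"], axes=[1], keepdims=True)
workspace.RunOperatorOnce(op)

mean, var = workspace.FetchBlob("mean"), workspace.FetchBlob("var")
np.testing.assert_allclose(mean, X.mean(axis=1, keepdims=True), rtol=1e-5)
np.testing.assert_allclose(var, X.var(axis=1, keepdims=True), rtol=1e-4, atol=1e-6)
```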
 
 REGISTER_CPU_OPERATOR (MultiClassAccuracy, MultiClassAccuracyOp< float, CPUContext >)
 
D float tensor (N, D,) of predicted scores of each class for each data. N is the number of instances
 
D float i batch size D is number of possible classes labels Input (1,"labels","1-D int tensor (N,) of labels for each instance.").Output(0
 
D float i batch size D is number of possible classes labels D float tensor (D,) of accuracy for each class. If a class has no instance in the batch
 
D float i batch size D is number of possible classes labels D float its accuracy score is set to zero Output (1,"amounts","1-D int tensor (D,) of number of instances for each class in the batch.")
 
 SHOULD_NOT_DO_GRADIENT (MultiClassAccuracy)
 
 REGISTER_CPU_OPERATOR (NegateGradient, NegateGradientOp< CPUContext >)
 
 SetDoc (R"DOC( NegagteGradient operator in forward pass simply copies input to the output, and in backward pass, flips the sign of the output gradient )DOC")
 
 REGISTER_GRADIENT (NegateGradient, GetNegateGradientGradient)
 
 REGISTER_CPU_OPERATOR (Negative, UnaryElementwiseOp< NumericTypes, CPUContext, NegativeFunctor< CPUContext >>)
 
 REGISTER_CUDA_OPERATOR (Negative, UnaryElementwiseOp< NumericTypes, CUDAContext, NegativeFunctor< CUDAContext >>)
 
 REGISTER_CPU_OPERATOR (NGramFromCategorical, NGramFromCategoricalOp< float, int64_t, CPUContext >)
 
 NO_GRADIENT (NGramFromCategorical)
 
 OPERATOR_SCHEMA (NGramFromCategorical).NumInputs(1).NumOutputs(1)
 
 REGISTER_CPU_OPERATOR (NormalizeL1, NormalizeL1Op< float, CPUContext >)
 
axis to normalize SetDoc (R"DOC( Given a matrix, apply L1-normalization along the specified axis. )DOC")
 
 REGISTER_CPU_OPERATOR (Normalize, NormalizeOp< float, CPUContext >)
 
axis to normalize SetDoc (R"DOC( Given a matrix, apply L2-normalization along the specified dimension. )DOC").IdenticalTypeAndShape()
 
 REGISTER_CPU_GRADIENT_OPERATOR (NormalizeGradient, NormalizeGradientOp< float, CPUContext >)
 
 REGISTER_GRADIENT (Normalize, GetNormalizeGradient)
 
 REGISTER_CPU_OPERATOR (NumpyTile, NumpyTileOp< CPUContext >)
 
The input tensor Input (1,"repeats","1-D Tensor specifying how many times to repeat"" each axis.").Output(0
 
The input tensor Tensor that will contain input replicated along the given axis InheritOnnxSchema ("Tile")
 
vector< TensorShape > TensorInferenceForBatchOneHot (const OperatorDef &, const vector< TensorShape > &in)
 
vector< TensorShape > TensorInferenceForBucketBatchOneHot (const OperatorDef &, const vector< TensorShape > &in)
 
OpSchema::Cost CostInferenceForBatchOneHot (const OperatorDef &def, const vector< TensorShape > &in)
 
 REGISTER_CPU_OPERATOR (BatchBucketOneHot, BatchBucketOneHotOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (BatchOneHot, BatchOneHotOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (OneHot, OneHotOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (SegmentOneHot, SegmentOneHotOp)
 
 REGISTER_CPU_OPERATOR (ONNXWhile, ONNXWhileOp< CPUContext >)
 
INT_MAX SetDoc (R"DOC( *** EXPERIMENTAL. This operator is a work-in-progress. No assumption should be made about the stability or correctness of this op. *** Generic Looping construct confirming to the ONNX Loop operator spec. This loop has multiple termination conditions: 1. Trip count. Iteration count specified at runtime. Set by specifying the input M. Optional. Set to empty string to omit. Note that a static trip count (specified at graph construction time) can be specified by passing in a constant node for input M. 2. Loop termination condition. This is an input to the op that determines whether to run the first interation and also a loop-carried dependency for the body graph. The body graph must yield a value for the condition variable, whether this input is provided or not. This table summarizes the operating modes of this operator with equivalent C-style code: Operator inputs defined as (max_trip_count, condition_var). Omitted optional inputs are represented as empty string. Concretely, in this caffe2 op an input is marked as omitted by setting its 'has_{name}' argument to False. input ("", ""): for (int i=0; ; ++i) { cond = ... // Note this value is ignored, but is required in the body } input ("", cond) // Note this is analogous to a while loop bool cond = ...; for (int i=0; cond; ++i) { cond = ...; } input ("", 1) // Note this is analogous to a do-while loop bool cond = true for (int i=0; cond; ++i) { cond = ...; } input (trip_count, "") // Note this is analogous to a for loop int trip_count = ... for (int i=0; i < trip_count; ++i) { cond = ...; // ignored } input (trip_count, cond) int trip_count = ...; bool cond = ...; for (int i=0; i < trip_count && cond; ++i) { cond = ...; } )DOC").Arg("body"
 
INT_MAX Net executed on each iteration Arg ("has_trip_count","Whether to use the trip count input").Arg("has_cond"
 
INT_MAX Net executed on each iteration Whether to use the condition input Arg ("save_scopes","Whether to save the scopes across iterations, as in ""for backprop").Arg("disable_scopes"
 
INT_MAX Net executed on each iteration Whether to use the condition input Do not create new scopes Use this only if you're certain there will be no name collisions, for example if you're converting from a fully-SSA IR Number of iterations to go out to Used if the flag has_trip_count is True Input (1,"first_iter_condition","Dynamic condition value for the first ""iteration. For all subsequent iterations,"" the condition from the body graph is ""used. This input is used if the flag ""has_cond is true.").NumOutputs(0
 
 REGISTER_CPU_OPERATOR (Onnxifi, OnnxifiOp< float, CPUContext >)
 
INT_MAX SetDoc (R"DOC( The Onnxifi operator is a black-box operator to lower the computation to Onnxifi backend )DOC").Arg("onnx_model"
 
 REGISTER_CPU_OPERATOR (NHWC2NCHW, NHWC2NCHWOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (NCHW2NHWC, NCHW2NHWCOp< float, CPUContext >)
 
out[0] add_dims (in[0].dims(in[0].dims_size()-1))
 
 SetDoc (R"DOC( The operator switches the order of data in a tensor from NHWC- sample index N, height H, width H and channels C, to the NCHW order (this is for 2D images). In general, this operator switches the order of data in a tensor from N H_1 ... H_k C to N C H_1 ... H_k for k-dimensional features, and currently supports k=1, 2, and 3. )DOC").Input(0
 
The input data (Tensor) in the NHWC order.") .Output(0
 
The input The output tensor (Tensor) in the NCHW order.")
 
out[0] add_dims (in[0].dims(1))
 
 SetDoc (R"DOC( The operator switches the order of data in a tensor from NCHW- sample index N, channels C, height H and width W, to the NHWC order (this is for 2D images). In general, this operator switches the order of data in a tensor from N C H_1 ... H_k to N H_1 ... H_k C for k-dimensional features, and currently supports k=1, 2, and 3. )DOC").Input(0
 
 REGISTER_GRADIENT (NHWC2NCHW, GetNHWC2NCHWGradient)
 
 REGISTER_GRADIENT (NCHW2NHWC, GetNCHW2NHWCGradient)
 
 REGISTER_CUDNN_OPERATOR (NHWC2NCHW, CuDNNNHWC2NCHWOp)
 
 REGISTER_CUDNN_OPERATOR (NCHW2NHWC, CuDNNNCHW2NHWCOp)
 
 REGISTER_CUDA_OPERATOR (NHWC2NCHW, NHWC2NCHWOp< float, CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (NCHW2NHWC, NCHW2NHWCOp< float, CUDAContext >)
 
 REGISTER_CPU_OPERATOR (PackSegments, PackSegmentsOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (UnpackSegments, UnpackSegmentsOp< CPUContext >)
 
 SetDoc ("Map N dim tensor to N+1 dim based on length blob. Sequences that \ are shorter than the longest sequence are padded with zeros.").Input(0
 
d int long tensor contains the length in each of the output Input (1,"tensor","N dim Tensor.").Output(0
 
d int long tensor contains the length in each of the output N dim Tensor where dim (1) is the max length" "
 
d int long tensor contains the length in each of the output N dim Tensor where dim (0) is the batch size.") .Output( 1
 
d int long tensor contains the length in each of the output N dim Tensor where dim boolean false where packed_tensor is true otherwise Arg ("max_length","The pre-defined max_length for the packed segments").Arg("pad_minf"
 
d int long tensor contains the length in each of the output N dim Tensor where dim boolean false where packed_tensor is true otherwise Padding number in the packed segments Use true to pad otherwise pad zeros Arg ("return_presence_mask","bool whether to return presence mask, false by default")
 
d int long tensor contains the length in each of the input Input (1,"tensor","N+1 dim Tensor.").Output(0
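 
A minimal NumPy sketch of the PackSegments semantics (zero padding up to the longest segment); `pack_segments` here is a hypothetical helper, not the Caffe2 implementation:

```
import numpy as np

def pack_segments(lengths, data):
    # Group rows of `data` by `lengths` and zero-pad to the longest segment,
    # adding a leading batch dimension (N dim -> N+1 dim).
    max_len = int(lengths.max()) if lengths.size else 0
    out = np.zeros((len(lengths), max_len) + data.shape[1:], dtype=data.dtype)
    start = 0
    for i, n in enumerate(lengths):
        out[i, :n] = data[start:start + n]
        start += n
    return out

lengths = np.array([2, 1])
data = np.arange(6, dtype=np.float32).reshape(3, 2)
print(pack_segments(lengths, data).shape)   # (2, 2, 2)
```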
 
 REGISTER_GRADIENT (PackSegments, GetPackSegmentsGradient)
 
 REGISTER_GRADIENT (UnpackSegments, GetUnpackSegmentsGradient)
 
PadMode StringToPadMode (const string &mode)
 
 REGISTER_CPU_OPERATOR (PadImage, PadImageOp< float, CPUContext >)
 
 REGISTER_CPU_GRADIENT_OPERATOR (PadImageGradient, PadImageGradientOp< float, CPUContext >)
 
CPUContext::PadTensorInference SetDoc (R"DOC( PadImage pads values around the boundary of an image according to the pad values and stride sizes defined by the ConvPoolOpBase operator. )DOC").Input(0
 
Input 0: input data tensor; dimensions depend on whether the NCHW or NHWC operators are being used. For example, in the former, the input has size (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. The corresponding permutation of dimensions is used in the latter case. Output (0,"Y","Output data tensor from padding the H and W dimensions on the tensor. Dimensions will vary based on various pad and stride sizes.")
 
 GRADIENT_OPERATOR_SCHEMA (PadImageGradient).NumInputs(1).NumOutputs(1)
 
 REGISTER_GRADIENT (PadImage, GetPadImageGradient)
 
 REGISTER_CPU_OPERATOR (Percentile, PercentileOp< CPUContext >)
 
Given a sample set of raw values, labeled with their corresponding percentiles from the same distribution, this operator takes as input a tensor of floats to find the percentile values for, a 2D tensor of floats where the first column represents sampled values and the second column represents the percentile labels, and a tensor of integers, lengths. This lengths tensor is used because the operator works on multiple sets of raw values at the same time. For example, for an input: original_values=[[3, 5, 3], [5, 1, 6]], lengths=[2, 1, 1], value_to_pct=[[3, 0.2], [5, 0.5], [1, 0.3], [3, 0.6]], our operator expects that each column i of the input tensor is sampled from distribution i. Lengths tells us that the first two elements in value_to_pct are sampled from distribution 1, the next is from distribution 2, and the last is from distribution 3. We expect the output of our operator to give us [[0.2, 1.0, 0.6], [0.5, 0.3, 1.0]]. To calculate the percentile of an element, we check to see if its value is already mapped to a percentile in value_to_pct. If so, we return that value. If not, we linearly interpolate between the two closest values in value_to_pct. If the value is larger than all values in value_to_pct, we return 1. If it is smaller than all the values, we return 0. )DOC") .Input (0,"original_values","Input 2D tensor of floats, representing the original, raw data to calculate percentiles for.")
 
Input 1, value_to_pct: sorted 2D tensor with two columns; each element in the first column is a float representing the raw value of a sample, and its corresponding element in the next column represents the percentile it maps to. Input (2,"lengths","1D tensor, representing the length of each distribution. We expect that the sum of elements of this tensor is equal to the total length of value_to_pct.").Output(0
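 
A hedged sketch of the per-distribution lookup described above (exact matches return the stored percentile, out-of-range values clamp to 0 or 1, everything else is linearly interpolated); the real operator handles several distributions at once:

```
import numpy as np

def percentile_1d(x, values, pcts):
    # values/pcts: one distribution's column of value_to_pct.
    order = np.argsort(values)
    values, pcts = values[order], pcts[order]
    if x < values[0]:
        return 0.0
    if x > values[-1]:
        return 1.0
    return float(np.interp(x, values, pcts))

vals = np.array([3.0, 5.0])
pcts = np.array([0.2, 0.5])
print(percentile_1d(4.0, vals, pcts))   # 0.35 (interpolated)
print(percentile_1d(6.0, vals, pcts))   # 1.0  (above all samples)
```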
 
 NO_GRADIENT (Percentile)
 
 REGISTER_CPU_OPERATOR (Perplexity, PerplexityOp< float, CPUContext >)
 
 OPERATOR_SCHEMA (Perplexity).NumInputs(1).NumOutputs(1).SetDoc(R"DOC( Perplexity calculates how well a probability distribution predicts a sample. Perplexity takes a 1-D tensor containing a batch of probabilities. Each value in the tensor belongs to a different sample and represents the probability of the model predicting the true label for that sample. The operator returns a single (float) perplexity value for the batch. )DOC").Input(0
 
Input 0: the input data as a Tensor; it contains a batch of true label or target probabilities. Output (0,"output","The output - a single (float) perplexity value for the batch")
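 
As a rough sketch, assuming the operator returns the geometric mean of the inverse probabilities (i.e. exp of the negative mean log probability):

```
import numpy as np

def perplexity(probs):
    # probs: batch of predicted probabilities for the true labels.
    return float(np.exp(-np.mean(np.log(probs))))

print(perplexity(np.array([0.5, 0.5, 0.5])))   # 2.0
```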
 
 SHOULD_NOT_DO_GRADIENT (Perplexity)
 
 REGISTER_CPU_OPERATOR (PiecewiseLinearTransform, PiecewiseLinearTransformOp< float, CPUContext >)
 
 NumInputs (1, 4).NumOutputs(1).SetDoc(R"DOC( PiecewiseLinearTransform takes inputs -- predictions
 
predictions, a 2-D or 1-D tensor of size (batch_size x prediction_dimensions), and the piecewise linear function parameters bounds, slopes and intercepts. The output tensor has the same shape as the input predictions and contains the predictions transformed by the piecewise linear functions. Each column of predictions has its own piecewise linear transformation functions; therefore the size of the piecewise function parameters is pieces x prediction_dimensions, except for binary predictions where only the positive prediction needs them. Note that in each piece, the low bound is excluded while the high bound is included. Also, the piecewise linear function must be continuous. Notes: if the input is binary predictions (Nx2 or Nx1 tensor), set the binary arg to true so that only one group of piecewise linear functions is needed (see details below). The transform parameters (bounds, slopes, intercepts) can be passed either through args or through input blobs. If we have multiple groups of piecewise linear functions, each group has the same number of pieces. If a prediction is out of the bounds, it is capped to the smallest or largest bound. )DOC")
 
Arg ("bounds","1-D vector of size (prediction_dimensions x (pieces+1)) contain the upper bounds of each piece of linear function. One special case is the first bound is the lower bound of whole piecewise function and we treat it the same as the left most functions. (bounds, slopes, intercepts) can be passed through either arg or input blobs.").Arg("slopes", "1-D vector of size (prediction_dimensions x pieces) containing the slopes of linear function").Arg("intercepts"
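 
A minimal single-column sketch of such a transform, assuming `bounds` has pieces+1 entries and `slopes`/`intercepts` have `pieces` entries; out-of-range predictions are capped to the nearest bound (illustrative only, not the Caffe2 kernel):

```
import numpy as np

def piecewise_linear(x, bounds, slopes, intercepts):
    # Piece i covers (bounds[i], bounds[i+1]]; predictions outside the bounds
    # are capped to the smallest or largest bound.
    x = np.clip(x, bounds[0], bounds[-1])
    idx = np.clip(np.searchsorted(bounds, x, side="left") - 1, 0, len(slopes) - 1)
    return slopes[idx] * x + intercepts[idx]

bounds = np.array([0.0, 0.5, 1.0])
slopes = np.array([1.0, 2.0])
intercepts = np.array([0.0, -0.5])   # chosen so the function is continuous at 0.5
print(piecewise_linear(np.array([0.25, 0.75]), bounds, slopes, intercepts))
# [0.25 1.  ]
```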
 
 REGISTER_CPU_OPERATOR (AveragePoolGradient, PoolGradientOp< float, CPUContext, AveragePoolFunctor< CPUContext >>)
 
 OPERATOR_SCHEMA (AveragePoolGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_CPU_OPERATOR (AveragePool1DGradient, PoolGradientOp< float, CPUContext, AveragePoolFunctor< CPUContext >>)
 
 OPERATOR_SCHEMA (AveragePool1DGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_CPU_OPERATOR (AveragePool2DGradient, PoolGradientOp< float, CPUContext, AveragePoolFunctor< CPUContext >>)
 
 OPERATOR_SCHEMA (AveragePool2DGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_CPU_OPERATOR (AveragePool3DGradient, PoolGradientOp< float, CPUContext, AveragePoolFunctor< CPUContext >>)
 
 OPERATOR_SCHEMA (AveragePool3DGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_CPU_OPERATOR (MaxPoolGradient, PoolGradientOp< float, CPUContext, MaxPoolFunctor< CPUContext >>)
 
 OPERATOR_SCHEMA (MaxPoolGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_CPU_OPERATOR (MaxPool1DGradient, PoolGradientOp< float, CPUContext, MaxPoolFunctor< CPUContext >>)
 
 OPERATOR_SCHEMA (MaxPool1DGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_CPU_OPERATOR (MaxPool2DGradient, PoolGradientOp< float, CPUContext, MaxPoolFunctor< CPUContext >>)
 
 OPERATOR_SCHEMA (MaxPool2DGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_CPU_OPERATOR (MaxPool3DGradient, PoolGradientOp< float, CPUContext, MaxPoolFunctor< CPUContext >>)
 
 OPERATOR_SCHEMA (MaxPool3DGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_GRADIENT (AveragePool, GetPoolGradient)
 
 REGISTER_GRADIENT (AveragePool1D, GetPoolGradient)
 
 REGISTER_GRADIENT (AveragePool2D, GetPoolGradient)
 
 REGISTER_GRADIENT (AveragePool3D, GetPoolGradient)
 
 REGISTER_GRADIENT (MaxPool, GetPoolGradient)
 
 REGISTER_GRADIENT (MaxPool1D, GetPoolGradient)
 
 REGISTER_GRADIENT (MaxPool2D, GetPoolGradient)
 
 REGISTER_GRADIENT (MaxPool3D, GetPoolGradient)
 
std::function< void(OpSchema &)> AveragePoolDocGenerator (const char *dim)
 
std::function< void(OpSchema &)> MaxPoolDocGenerator (const char *dim)
 
 REGISTER_CPU_OPERATOR (AveragePool, PoolOp< float, CPUContext, AveragePoolFunctor< CPUContext >>)
 
NumInputs(1).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext >)  REGISTER_CPU_OPERATOR (AveragePool1D, PoolOp< float, CPUContext, AveragePoolFunctor< CPUContext >>)
 
NumInputs(1).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext >)  REGISTER_CPU_OPERATOR (AveragePool2D, PoolOp< float, CPUContext, AveragePoolFunctor< CPUContext >>)
 
NumInputs(1).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext >)  REGISTER_CPU_OPERATOR (AveragePool3D, PoolOp< float, CPUContext, AveragePoolFunctor< CPUContext >>)
 
NumInputs(1).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext >)  REGISTER_CPU_OPERATOR (MaxPool, PoolOp< float, CPUContext, MaxPoolFunctor< CPUContext >>)
 
NumInputs(1).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext >)  REGISTER_CPU_OPERATOR (MaxPool1D, PoolOp< float, CPUContext, MaxPoolFunctor< CPUContext >>)
 
NumInputs(1).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext >)  REGISTER_CPU_OPERATOR (MaxPool2D, PoolOp< float, CPUContext, MaxPoolFunctor< CPUContext >>)
 
NumInputs(1).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext >)  REGISTER_CPU_OPERATOR (MaxPool3D, PoolOp< float, CPUContext, MaxPoolFunctor< CPUContext >>)
 
 REGISTER_CUDNN_OPERATOR (AveragePool, CuDNNPoolOp< CuDNNAveragePoolFunctor >)
 
 REGISTER_CUDNN_OPERATOR (AveragePoolGradient, CuDNNPoolGradientOp< CuDNNAveragePoolFunctor >)
 
 REGISTER_CUDNN_OPERATOR (AveragePool1D, CuDNNPoolOp< CuDNNAveragePoolFunctor >)
 
 REGISTER_CUDNN_OPERATOR (AveragePool1DGradient, CuDNNPoolGradientOp< CuDNNAveragePoolFunctor >)
 
 REGISTER_CUDNN_OPERATOR (AveragePool2D, CuDNNPoolOp< CuDNNAveragePoolFunctor >)
 
 REGISTER_CUDNN_OPERATOR (AveragePool2DGradient, CuDNNPoolGradientOp< CuDNNAveragePoolFunctor >)
 
 REGISTER_CUDNN_OPERATOR (AveragePool3D, CuDNNPoolOp< CuDNNAveragePoolFunctor >)
 
 REGISTER_CUDNN_OPERATOR (AveragePool3DGradient, CuDNNPoolGradientOp< CuDNNAveragePoolFunctor >)
 
 REGISTER_CUDNN_OPERATOR (MaxPool, CuDNNPoolOp< CuDNNMaxPoolFunctor >)
 
 REGISTER_CUDNN_OPERATOR (MaxPoolGradient, CuDNNPoolGradientOp< CuDNNMaxPoolFunctor >)
 
 REGISTER_CUDNN_OPERATOR (MaxPool1D, CuDNNPoolOp< CuDNNMaxPoolFunctor >)
 
 REGISTER_CUDNN_OPERATOR (MaxPool1DGradient, CuDNNPoolGradientOp< CuDNNMaxPoolFunctor >)
 
 REGISTER_CUDNN_OPERATOR (MaxPool2D, CuDNNPoolOp< CuDNNMaxPoolFunctor >)
 
 REGISTER_CUDNN_OPERATOR (MaxPool2DGradient, CuDNNPoolGradientOp< CuDNNMaxPoolFunctor >)
 
 REGISTER_CUDNN_OPERATOR (MaxPool3D, CuDNNPoolOp< CuDNNMaxPoolFunctor >)
 
 REGISTER_CUDNN_OPERATOR (MaxPool3DGradient, CuDNNPoolGradientOp< CuDNNMaxPoolFunctor >)
 
 REGISTER_CPU_OPERATOR (Pow, PowOp< TensorTypes< float >, CPUContext, EigenPowFunctor, SameTypeAsInput >).NumInputs(1
 
 NumOutputs (1).AllowInplace(
 
 REGISTER_CPU_OPERATOR (PRelu, PReluOp< float, CPUContext >)
 
 REGISTER_CPU_GRADIENT_OPERATOR (PReluGradient, PReluGradientOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (PrependDim, PrependDimOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (MergeDim, MergeDimOp< CPUContext >)
 
 SetDoc (R"DOC( Reshape the tensor by prepending a dimension of fixed size and dividing the size of the next dimension by that amount. )DOC").Arg("dim_size"
 
Arg dim_size: size of the dimension to prepend. Input (0,"data","An input tensor.").Output(0
 
 SetDoc (R"DOC( Merge first two dimensions in a single dimension with size dim(0) * dim(1). )DOC").Input(0
 
Input 0: an input tensor. Output (0,"reshaped","Reshaped tensor.").InheritOnnxSchema("Reshape")
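 
The reshape performed by PrependDim and MergeDim can be sketched with NumPy (illustrative only):

```
import numpy as np

x = np.arange(24, dtype=np.float32).reshape(6, 4)

dim_size = 2
# PrependDim: (6, 4) -> (2, 3, 4); the leading dim is split by dim_size.
prepended = x.reshape((dim_size, x.shape[0] // dim_size) + x.shape[1:])
# MergeDim: (2, 3, 4) -> (6, 4); the first two dims are fused back together.
merged = prepended.reshape((-1,) + prepended.shape[2:])

print(prepended.shape, merged.shape)   # (2, 3, 4) (6, 4)
```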
 
 REGISTER_GRADIENT (PrependDim, GetPrependDimGradient)
 
 REGISTER_CUDA_OPERATOR (PrependDim, PrependDimOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (MergeDim, MergeDimOp< CUDAContext >)
 
 REGISTER_CPU_OPERATOR (QuantDecode, QuantDecodeOp< QuantDecodeRunTy::RUN_ALWAYS >)
 
 REGISTER_CPU_GRADIENT_OPERATOR (QuantDecodeGradient, QuantDecodeGradientOp)
 
void initQNNPACK ()
 
 REGISTER_CPU_OPERATOR (Int8Add, int8::Int8AddOp< int8::Activation::NONE >)
 
 REGISTER_CPU_OPERATOR (Int8AddRelu, int8::Int8AddOp< int8::Activation::RELU >)
 
 REGISTER_CPU_OPERATOR (Int8Sum, int8::Int8AddOp< int8::Activation::NONE >)
 
 REGISTER_CPU_OPERATOR (Int8SumRelu, int8::Int8AddOp< int8::Activation::RELU >)
 
 Arg ("Y_scale","Output tensor quantization scale").Arg("Y_zero_point"
 
Arg Y_zero_point: output tensor quantization offset. SetDoc (R"DOC( Performs element-wise binary Add (with no broadcast support). )DOC").Input(0
 
Input 0, A: first operand; should share the type with the second operand. Input (1,"B","Second operand. It should be of the same size as A.").Output(0
 
For the Relu-fused variant: SetDoc (R"DOC( Performs element-wise binary Add (with no broadcast support). Output will go through rectified linear function, where y = max(0, x). )DOC").Input(0
 
 NumInputs (1, std::numeric_limits< int >::max()).NumOutputs(1).AllowInplace(
 
Arg Y_scale: output tensor quantization scale. Arg ("Y_zero_point","Output tensor quantization offset")
 
 REGISTER_CPU_OPERATOR (Int8AveragePool, int8::Int8AveragePoolOp< int8::Activation::NONE >)
 
 REGISTER_CPU_OPERATOR (Int8AveragePoolRelu, int8::Int8AveragePoolOp< int8::Activation::RELU >)
 
std::function< void(OpSchema &)> AveragePoolDocGenerator (const char *dim, bool relu_fused=false)
 
 REGISTER_CPU_OPERATOR (Int8ChannelShuffle, int8::Int8ChannelShuffleOp)
 
 REGISTER_CPU_OPERATOR (Int8Concat, int8::Int8ConcatOp)
 
Arg ("axis","Which axis to concat on").Arg("add_axis", "Pass 1 to add the axis specified in arg axis to all input tensors")
 
TensorInferenceFunction (OpSchema::NeedsAllInputShapes(TensorInferenceForConcat)).CostInferenceFunction(CostInferenceForConcat).SetDoc("Concatenate a list of tensors into a single tensor").Output(0
 
Output 0: concatenated tensor. Output (1,"split_info","The dimensions of the inputs.").InheritOnnxSchema("Concat")
 
 REGISTER_CPU_OPERATOR (Int8Conv, int8::Int8ConvOp< int8::Activation::NONE >)
 
std::function< void(OpSchema &)> ConvDocGenerator (const char *dim, bool relu_fused=false)
 
 REGISTER_CPU_OPERATOR (Int8ConvRelu, int8::Int8ConvOp< int8::Activation::RELU >)
 
 REGISTER_CPU_OPERATOR (Int8ConvTranspose, int8::Int8ConvTransposeOp)
 
Input (0,"X","Input data blob from previous layer; has size (N x H x W x C), where N is the batch size, C is the number of channels, and H and W are the height and width. Note that NHWC is supported now").Input(1
 
Input 1: the filter blob; has size (M x kH x kW x C), where C is the number of channels, and kH and kW are the height and width of the kernel.
 
Input (2,"bias","The 1D bias blob that is added through the convolution; has size (C). Optional, if not passed, will treat it as all 0.").Output(0
 
 REGISTER_CPU_OPERATOR (Int8Dequantize, int8::Int8DequantizeOp)
 
Int8 Tensor qX Output (0,"Y","FP32 Tensor that represents mapped real value of qX.")
 
 REGISTER_CPU_OPERATOR (Int8FC, int8::Int8FCOp)
 
 REGISTER_CPU_OPERATOR (Int8Flatten, int8::Int8FlattenOp)
 
Given an input tensor of shape (d_0, d_1, ..., d_n), the output will have shape (d_0 X d_1...d_(axis-1), d_axis X d_(axis+1)...X d_n). )DOC") .Input(0
 
Output 0: a 2D Int8 tensor with input dimensions up to axis flattened to the outer dimension of the output and the remaining input dimensions flattened into the inner dimension of the output. Arg ("axis","(Default to 1) Indicate up to which input dimensions (exclusive) should be flattened to the outer dimension of the output")
 
Arg values: input array of type char (byte). Arg("shape", "Input tensor shape"), plus the Y_scale / Y_zero_point quantization args. SetDoc (R"DOC( Creates quantized tensor of type char(byte) with scale and zero point info. )DOC").Output(0
 
For Int8GivenIntTensorFill, the values arg is an input array of type int32. Arg ("shape","Input tensor shape").Arg("Y_scale"
 
 REGISTER_CPU_OPERATOR (Int8GivenTensorFill, int8::Int8GivenTensorFillOp)
 
 REGISTER_CPU_OPERATOR (Int8GivenIntTensorFill, int8::Int8GivenIntTensorFillOp)
 
 REGISTER_CPU_OPERATOR (Int8LeakyRelu, int8::Int8LeakyReluOp)
 
Arg alpha: coefficient of leakage; default value is 0.01. Arg Y_zero_point: output tensor quantization offset. Takes one input data tensor and produces one output data (Tensor< T >) where the function f(x) = alpha * x for x < 0 (and f(x) = x otherwise) is applied to the data tensor elementwise. )DOC") Input (0,"X","1D input tensor").Output(0
 
 REGISTER_CPU_OPERATOR (Int8MaxPool, int8::Int8MaxPoolOp< int8::Activation::NONE >)
 
 REGISTER_CPU_OPERATOR (Int8MaxPoolRelu, int8::Int8MaxPoolOp< int8::Activation::RELU >)
 
std::function< void(OpSchema &)> MaxPoolDocGenerator (const char *dim, bool relu_fused=false)
 
 REGISTER_CPU_OPERATOR (Int8Quantize, int8::Int8QuantizeOp)
 
Arg Y_scale: output tensor quantization scale. Input 0: FP32 Tensor X. Output (0,"Y","Int8 Tensor qX representing X with linear quantization.")
 
 REGISTER_CPU_OPERATOR (Int8Relu, int8::Int8ReluOp)
 
 CostInferenceFunction (CostInferenceForRelu).IdenticalTypeAndShape().SetDoc(R"DOC( Relu takes one input data (Tensor<T>) and produces one output data (Tensor<T>) where the rectified linear function
 
y = max(0, x), is applied to the tensor elementwise. )DOC") Input 0: input tensor. InheritOnnxSchema ("Relu")
 
 REGISTER_CPU_OPERATOR (Int8Reshape, int8::Int8ReshapeOp)
 
 SetDoc (R"DOC( Reshape the input tensor similar to numpy.reshape. It takes a tensor as input and an optional tensor specifying the new shape. When the second input is absent, an extra argument `shape` must be specified. It outputs the reshaped tensor as well as the original shape. At most one dimension of the new shape can be -1. In this case, the value is inferred from the size of the tensor and the remaining dimensions. A dimension could also be 0, in which case the actual dimension value is going to be copied from the input tensor. )DOC").Arg("shape"
 
Arg shape: new shape. Arg Y_zero_point: output tensor quantization offset. Output (0,"reshaped","Reshaped data.").Output(1
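 
A hedged sketch of how the `shape` argument is resolved (at most one -1 is inferred from the remaining size; a 0 copies the corresponding input dimension); `resolve_shape` is a hypothetical helper, not the Caffe2 code:

```
import numpy as np

def resolve_shape(old_shape, new_shape):
    # 0 entries copy the input dimension; a single -1 is inferred.
    out = [old_shape[i] if d == 0 else d for i, d in enumerate(new_shape)]
    if -1 in out:
        known = int(np.prod([d for d in out if d != -1]))
        out[out.index(-1)] = int(np.prod(old_shape)) // known
    return tuple(out)

print(resolve_shape((2, 3, 4), (0, -1)))   # (2, 12)
print(resolve_shape((2, 3, 4), (-1, 4)))   # (6, 4)
```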
 
 REGISTER_CPU_OPERATOR (Int8ResizeNearest, int8::Int8ResizeNearestOp)
 
Arg width_scale: scale along width dimension. Arg ("height_scale","Scale along height dimension").SetDoc(R"DOC( Resizes the spatial dimensions of the input using nearest neighbor interpolation. The `width_scale` and `height_scale` arguments control the size of the output. )DOC")
 
Input 0: input Int8 tensor. Output (0,"Y","Output Int8 tensor")
 
 REGISTER_CPU_OPERATOR (Int8RoIAlign, int8::Int8RoIAlignOp)
 
Arg spatial_scale: spatial scale of the input feature map X relative to the input image, e.g. 0.0625 if X has a stride of 16 w.r.t. the input image. Arg ("pooled_h","(int) default 1; Pooled output Y's height.").Arg("pooled_w", "(int) default 1; Pooled output Y's width.")
 
Arg ("sampling_ratio","(int) default -1; number of sampling points in the interpolation grid used to compute the output value of each pooled output bin. If > 0, then exactly sampling_ratio x sampling_ratio grid points are used. If <= 0, then an adaptive number of grid points are used (computed as ceil(roi_width / pooled_w), and likewise for height).").Input(0
 
Input 0: Int8 Tensor feature map input of shape (N, C, H, W). Input(1
 
 REGISTER_CPU_OPERATOR (Int8Sigmoid, int8::Int8SigmoidOp)
 
Input 0: the input tensor that's coerced into a 2D matrix of size (N x D) as described above. Output(0
 
 REGISTER_CPU_OPERATOR (Int8Slice, int8::Int8SliceOp)
 
 REGISTER_CPU_OPERATOR (Int8Softmax, int8::Int8SoftmaxOp)
 
The input does not need to explicitly be a 2D vector; rather, it will be coerced into one. For an arbitrary n-dimensional tensor X in [a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}], where k is the axis provided, then X will be coerced into a 2-dimensional tensor with dimensions [a_0 * ... * a_{k-1}, a_k * ... * a_{n-1}]. For the default case where axis=1, this means the X tensor will be coerced into a 2D tensor of dimensions [a_0, a_1 * ... * a_{n-1}], where a_0 is often the batch size. In this situation, we must have a_0 = N and a_1 * ... * a_{n-1} = D, or else the operator will throw errors. )DOC") .Arg ("axis","(int) default to 1; describes the axis of the inputs when coerced to 2D; defaults to one because the 0th axis most likely describes the batch_size").Input(0
 
float addErrorTolerance (float scale)
 
std::unique_ptr< int8::Int8TensorCPU > q (const std::vector< int64_t > &dims)
 
std::unique_ptr< int8::Int8TensorCPU > biasq (const std::vector< int64_t > &dims, double scale)
 
std::unique_ptr< TensorCPU > dq (const int8::Int8TensorCPU &XQ)
 
std::unique_ptr< TensorCPU > biasdq (const int8::Int8TensorCPU &XQ)
 
void int8Copy (int8::Int8TensorCPU *dst, const int8::Int8TensorCPU &src)
 
void add_input (const vector< int64_t > &shape, const vector< float > &values, const string &name, Workspace *ws)
 
int randomInt (int a, int b)
 
 REGISTER_CPU_OPERATOR (ReciprocalGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, ReciprocalGradientFunctor< CPUContext >>)
 
 REGISTER_GRADIENT (Reciprocal, GetReciprocalGradient)
 
 REGISTER_CPU_OPERATOR (Reciprocal, UnaryElementwiseOp< TensorTypes< float >, CPUContext, ReciprocalFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (ReduceFrontMax, MaxReduceDimsOp< float, CPUContext, true >)
 
 REGISTER_CPU_OPERATOR (ReduceFrontMaxGradient, MaxReduceDimsGradientOp< float, CPUContext, true >)
 
 REGISTER_CPU_OPERATOR (ReduceBackMax, MaxReduceDimsOp< float, CPUContext, false >)
 
 REGISTER_CPU_OPERATOR (ReduceBackMaxGradient, MaxReduceDimsGradientOp< float, CPUContext, false >)
 
 REGISTER_GRADIENT (ReduceFrontMax, GetReduceFrontMaxGradient)
 
 REGISTER_GRADIENT (ReduceBackMax, GetReduceBackMaxGradient)
 
An optional second input, `lengths`, can be passed, which enforces that only a subset of the elements are considered in the max operation. If the input tensor X has shape $(d_0, d_1, d_2, ..., d_n)$, `lengths` must have shape $(d_1 * d_2 * ... * d_n)$. The values of the `lengths` tensor determine how many of the values to consider for each vector in the $d_0$ dimension.
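 
Without the optional `lengths` input, ReduceFrontMax simply takes the maximum over the first `num_reduce_dim` dimensions; a NumPy sketch (illustrative only):

```
import numpy as np

def reduce_front_max(x, num_reduce_dim=1):
    # Collapse the leading num_reduce_dim dimensions and take their max.
    flat = x.reshape((-1,) + x.shape[num_reduce_dim:])
    return flat.max(axis=0)

x = np.arange(24, dtype=np.float32).reshape(2, 3, 4)
print(reduce_front_max(x, num_reduce_dim=2))   # shape (4,): [20. 21. 22. 23.]
```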
 
 REGISTER_CPU_OPERATOR (ReduceFrontMean, SumReduceDimsOp< CPUContext, true, true >)
 
 REGISTER_CPU_OPERATOR (ReduceFrontMeanGradient, SumReduceDimsGradientOp< CPUContext, true, true >)
 
 REGISTER_GRADIENT (ReduceFrontMean, GetReduceFrontMeanGradient)
 
 REGISTER_CPU_OPERATOR (ReduceFrontSum, SumReduceDimsOp< CPUContext, true, false >)
 
 REGISTER_CPU_OPERATOR (ReduceFrontSumGradient, SumReduceDimsGradientOp< CPUContext, true, false >)
 
 REGISTER_GRADIENT (ReduceFrontSum, GetReduceFrontSumGradient)
 
 REGISTER_CPU_OPERATOR (ReduceBackSum, SumReduceDimsOp< CPUContext, false, false >)
 
 REGISTER_CPU_OPERATOR (ReduceBackSumGradient, SumReduceDimsGradientOp< CPUContext, false, false >)
 
 REGISTER_GRADIENT (ReduceBackSum, GetReduceBackSumGradient)
 
 REGISTER_CPU_OPERATOR (ReduceMin, ReduceOp< TensorTypes< std::int32_t, std::int64_t, float, double >, CPUContext, MinReducer< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (ReduceMinGradient, ReduceGradientOp< TensorTypes< std::int32_t, std::int64_t, float, double >, CPUContext, MinReducer< CPUContext >>)
 
If keepdims is not set, the resulting tensor has the reduced dimensions pruned. )DOC") Arg ("axes","A list of integers, along which to reduce.").Arg("keepdims", "Keep the reduced dimension(s) or not; default True keeps the reduced dimension(s).")
 
Input 0: an input tensor. Output (0,"reduced","Reduced output tensor.")
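 
The effect of `axes` and `keepdims` mirrors NumPy's reductions; for example:

```
import numpy as np

x = np.arange(24, dtype=np.float32).reshape(2, 3, 4)
print(np.min(x, axis=(0, 2), keepdims=True).shape)    # (1, 3, 1) - reduced dims kept
print(np.min(x, axis=(0, 2), keepdims=False).shape)   # (3,)      - reduced dims pruned
```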
 
 OPERATOR_SCHEMA (ReduceMinGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_CPU_OPERATOR (ReduceMax, ReduceOp< TensorTypes< std::int32_t, std::int64_t, float, double >, CPUContext, MaxReducer< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (ReduceMaxGradient, ReduceGradientOp< TensorTypes< std::int32_t, std::int64_t, float, double >, CPUContext, MaxReducer< CPUContext >>)
 
 OPERATOR_SCHEMA (ReduceMaxGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_CPU_OPERATOR (ReduceSum, ReduceOp< TensorTypes< std::int32_t, std::int64_t, float, double >, CPUContext, SumReducer< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (ReduceSumGradient, ReduceGradientOp< TensorTypes< std::int32_t, std::int64_t, float, double >, CPUContext, SumReducer< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (SumElements, SumElementsOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (SumElementsInt, SumElementsIntOp< int, CPUContext >)
 
 REGISTER_CPU_OPERATOR (SumSqrElements, SumSqrElementsOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (SumElementsGradient, SumElementsGradientOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (RowwiseMax, MaxReductionOp< float, CPUContext, true >)
 
 REGISTER_CPU_OPERATOR (RowwiseMaxGradient, MaxReductionGradientOp< float, CPUContext, true >)
 
 REGISTER_CPU_OPERATOR (ColwiseMaxGradient, MaxReductionGradientOp< float, CPUContext, false >)
 
 REGISTER_CPU_OPERATOR (ColwiseMax, MaxReductionOp< float, CPUContext, false >)
 
NumInputs(1).NumOutputs(1).ScalarType(TensorProto...)  NumInputs(1).NumOutputs(1).ScalarType(TensorProto...)  SHOULD_NOT_DO_GRADIENT (SumElementsInt)
 
NumInputs(1).NumOutputs(1).ScalarType(TensorProto...)  OPERATOR_SCHEMA (SumElementsGradient).NumInputs(2).NumOutputs(1)
 
 REGISTER_GRADIENT (SumElements, GetSumElementsGradient)
 
 REGISTER_CPU_OPERATOR (ReluN, UnaryElementwiseWithArgsOp< TensorTypes< float >, CPUContext, ReluNFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (ReluNGradient, BinaryElementwiseWithArgsOp< TensorTypes< float >, CPUContext, ReluNGradientFunctor< CPUContext >>)
 
the cap of forward op output AllowInplace ({{1, 0}}).SetDoc(R"DOC( ReluGradient takes both Y and dY and uses this to update dX according to the chain rule and derivatives of the rectified linear function. )DOC")
 
 REGISTER_GRADIENT (ReluN, GetReluNGradient)
 
 REGISTER_CPU_OPERATOR (Relu, UnaryElementwiseOp< TensorTypes< float >, CPUContext, ReluFunctor< CPUContext >>)
 
 REGISTER_CPU_GRADIENT_OPERATOR (ReluGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, ReluGradientFunctor< CPUContext >>)
 
 SetDoc (R"DOC( ReluGradient takes both Y and dY and uses this to update dX according to the chain rule and derivatives of the rectified linear function. )DOC")
 
 REGISTER_GRADIENT (Relu, GetReluGradient)
 
 REGISTER_CUDNN_OPERATOR (Relu, CuDNNActivationOp< CUDNN_ACTIVATION_RELU >)
 
 REGISTER_CUDNN_OPERATOR (ReluGradient, CuDNNActivationGradientOp< CUDNN_ACTIVATION_RELU >)
 
 REGISTER_CPU_OPERATOR (ReplaceNaN, ReplaceNaNOp< CPUContext >)
 
 SHOULD_NOT_DO_GRADIENT (ReplaceNaN)
 
 REGISTER_CPU_OPERATOR (Reshape, ReshapeOp< float, CPUContext >)
 
out[1] set_data_type (TensorProto::INT64)
 
out[1] add_dims (in[0].dims_size())
 
 if (!helper.HasArgument("shape"))
 
 CAFFE_ENFORCE_EQ (in.size(), 1,"New shape must not be specified by the input blob and the ""argument `shape` at the same time.")
 
 for (const auto d:in[0].dims())
 
 if (unknownIdx!=-1)
 
 for (const auto d:actualNewShape)
 
 REGISTER_CUDA_OPERATOR (Reshape, ReshapeOp< float, CUDAContext >)
 
void resizeNearestNCHW2x (int batch_size, int num_channels, int input_height, int input_width, const float *input, float *output)
 
 REGISTER_CPU_OPERATOR (ResizeNearest, ResizeNearestOp< float, CPUContext >)
 
 REGISTER_CPU_GRADIENT_OPERATOR (ResizeNearestGradient, ResizeNearestGradientOp< float, CPUContext >)
 
Arg width_scale: scale along width dimension. Input 0: input tensor. Input (1,"scales","1D, 2-element, Scales tensor, [height_scale, width_scale]").Output(0
 
Output 0: output tensor. InheritOnnxSchema ("Upsample")
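 
For integer scales, nearest-neighbor resizing of an NCHW tensor amounts to repeating rows and columns (cf. the resizeNearestNCHW2x fast path above); a NumPy sketch, illustrative only:

```
import numpy as np

def resize_nearest_nchw(x, height_scale=2, width_scale=2):
    # Repeat pixels along H and W; equivalent to nearest-neighbor for integer scales.
    return x.repeat(height_scale, axis=2).repeat(width_scale, axis=3)

x = np.arange(8, dtype=np.float32).reshape(1, 2, 2, 2)
print(resize_nearest_nchw(x).shape)   # (1, 2, 4, 4)
```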
 
 REGISTER_GRADIENT (ResizeNearest, GetResizeNearestGradient)
 
 REGISTER_CPU_OPERATOR (ReversePackedSegs, ReversePackedSegsOp< CPUContext >)
 
Reverse each segment in a (lengths, segments, embeddings) tensor, leaving paddings unchanged. This operator is used to reverse the input of a recurrent neural network to make it a BRNN. )DOC") .Input (0,"data","a 3-D (lengths, segments, embeddings,) tensor.").Input(1
 
Input 1: length of each segment. Output (0,"reversed data","a (lengths, segments, embeddings,) tensor with each segment reversed and paddings unchanged.")
 
 REGISTER_GRADIENT (ReversePackedSegs, GetReversePackedSegsGradient)
 
 REGISTER_CPU_OPERATOR (RMACRegions, RMACRegionsOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (RecurrentNetworkBlobFetcher, RecurrentNetworkBlobFetcherOp< CPUContext >)
 
Prefix string to prepend extracted blobs Input (0,"ScratchWorkspaceBlob","Name of scratch workspace blob returned by recurrent network.").Output(0
 
 SHOULD_NOT_DO_GRADIENT (RecurrentNetworkBlobFetcher)
 
 REGISTER_CUDA_OPERATOR (RecurrentNetworkBlobFetcher, RecurrentNetworkBlobFetcherOp< CUDAContext >)
 
template<>
std::unique_ptr< RecurrentNetworkExecutorBase > createRNNExecutor< CPUContext > (const NetDef &step_net_def, std::map< string, string > &recurrent_input_map, std::string timestep_blob, ArgumentHelper rnn_args)
 Implementation of RecurrentNetworkExecutor that uses thread pool for multithreaded execution of RNNs. More...
 
template<class Context >
std::unique_ptr< RecurrentNetworkExecutorBase > createRNNExecutor (const NetDef &step_net_def, std::map< string, string > &recurrent_input_map, std::string timestep_blob, ArgumentHelper rnn_args)
 
template<>
std::unique_ptr< RecurrentNetworkExecutorBase > createRNNExecutor< CUDAContext > (const NetDef &step_net_def, std::map< string, string > &recurrent_input_map, std::string timestep_blob, ArgumentHelper arg_helper)
 
 CAFFE_KNOWN_TYPE (detail::ScratchWorkspaces)
 
 REGISTER_CPU_OPERATOR (RecurrentNetwork, RecurrentNetworkOp< CPUContext >)
 
INT_MAX SetDoc (R"DOC( Run the input network in a recurrent fashion. This can be used to implement fairly general recurrent neural networks (RNNs). The operator proceeds as follows. - First, initialized the states from the input recurrent states - For each timestep T, apply the links (that map offsets from input/output tensors into the inputs/outputs for the `step` network) - Finally, alias the recurrent states to the specified output blobs. This is a fairly special-case meta-operator, and so the implementation is somewhat complex. It trades of generality (and frankly usability) against performance and control (compared to e.g. TF dynamic_rnn, Theano scan, etc). See the usage examples for a flavor of how to use it. )DOC")
 
 REGISTER_CPU_OPERATOR (RecurrentNetworkGradient, RecurrentNetworkGradientOp< CPUContext >)
 
 OPERATOR_SCHEMA (RecurrentNetworkGradient)
 
 REGISTER_CPU_OPERATOR (rnn_internal_accumulate_gradient_input, AccumulateInputGradientOp< CPUContext >)
 
INT_MAX EnforceInplace ({{2, 0}}).Private().SetDoc(R"DOC( Internal RNN operator. )DOC")
 
 REGISTER_CPU_OPERATOR (rnn_internal_apply_link, RNNApplyLinkOp< CPUContext >)
 
 Private ().SetDoc(R"DOC( Internal RNN operator. )DOC")
 
 REGISTER_GRADIENT (RecurrentNetwork, GetRecurrentNetworkGradient)
 
 REGISTER_CUDNN_OPERATOR (Recurrent, RecurrentOp< float >)
 
 OPERATOR_SCHEMA (Recurrent).NumInputs(4).NumOutputs(5).SetDoc(R"DOC( Recurrent wraps the CuDNN R5 RNN implementation. See the CuDNN R5 documentation for more information. In general
 
the implementation takes an input (TxNxD) tensor, the hidden state input (NxD), the cell input (NxD), and a weight tensor (effectively an opaque blob, where the size and layout is dictated by CuDNN). The outputs are the output (again, TxNxD) and the final hidden/cell states (NxD). These can be reset (at sequence boundaries across minibatches) by multiplying by zero. The CuDNN arguments (hidden_size, ...)
 
 REGISTER_CUDNN_OPERATOR (RecurrentGradient, RecurrentGradientOp< float >)
 
 NumInputs (7).NumOutputs(6).AllowInplace(
 
 REGISTER_CUDNN_OPERATOR (RecurrentParamSet, RecurrentParamAccessOp< float, SET_PARAM >)
 
 SetDoc ("Set individual parameters of a recurrent net.").Arg("param_type"
 
Arg param_type: type of param to be set: "input_gate_w", "forget_gate_w", "cell_w", "output_gate_w", "input_gate_b", "forget_gate_b", "cell_b", "output_gate_b". Arg input_type: "recurrent" or "input". Arg ("layer","layer index (starting from 0)").Input(0
 
Input 0: input blob; needed for inferring the shapes; a dummy tensor matching the input shape is ok. Input 1: blob holding all the parameters. Input (2,"param","Values for the specified parameter").Output(0
 
Output 0: blob holding all the parameters (same as input(1)).
 
 REGISTER_CUDNN_OPERATOR (RecurrentParamGet, RecurrentParamAccessOp< float, GET_PARAM >)
 
For RecurrentParamGet: Output (0,"param","Blob holding the requested values")
 
 REGISTER_GRADIENT (Recurrent, GetRecurrentGradient)
 
 REGISTER_CPU_OPERATOR (RoIAlignGradient, RoIAlignGradientOp< float, CPUContext >)
 
Input 0: see RoIPoolF. Input (1,"RoIs","See RoIPoolF.").Input(2
 
Input 2: gradient of forward output (Y). Output 0: gradient of forward input (X).
 
 REGISTER_GRADIENT (RoIAlign, GetRoIAlignGradient)
 
 REGISTER_CPU_OPERATOR (RoIAlign, RoIAlignOp< float, CPUContext >)
 
See RoIAlignRotated Input (1,"RoIs","See RoIAlignRotated.").Input(2
 
 REGISTER_GRADIENT (RoIAlignRotated, GetRoIAlignRotatedGradient)
 
 REGISTER_CPU_OPERATOR (RoIAlignRotated, RoIAlignRotatedOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (RoIPool, RoIPoolOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (RoIPoolGradient, RoIPoolGradientOp< float, CPUContext >)
 
 TensorInferenceFunction ([](const OperatorDef &def, const vector< TensorShape > &in){ArgumentHelper helper(def);const StorageOrder order=StringToStorageOrder(helper.GetSingleArgument< string >("order","NCHW"));const TensorShape &X=in[0];const int num_channels=(order==StorageOrder::NCHW?X.dims(1):X.dims(3));const TensorShape &R=in[1];const int num_rois=R.dims(0);const int pooled_height=helper.GetSingleArgument< int >("pooled_h", 1);const int pooled_width=helper.GetSingleArgument< int >("pooled_w", 1);TensorShape Y=CreateTensorShape(vector< int >({num_rois, num_channels, pooled_height, pooled_width}), X.data_type());bool is_test=helper.GetSingleArgument< int >(OpSchema::Arg_IsTest, 0);if(!is_test){TensorShape argmaxes=Y;argmaxes.set_data_type(TensorProto_DataType_INT32);return vector< TensorShape >({Y, argmaxes});}else{return vector< TensorShape >({Y});}}).SetDoc(R"DOC( Carries out ROI Pooling for Faster-RCNN. Depending on the mode
 
there are multiple outputs: Y and argmaxes (train mode), or Y only (test mode). )DOC") .Arg("is_test", "If set, run in test mode and skip computation of argmaxes (used for gradient computation). Only one output tensor is produced. (Default: false)")
 
 OPERATOR_SCHEMA (RoIPoolGradient).NumInputs(4).NumOutputs(1)
 
 REGISTER_GRADIENT (RoIPool, GetRoIPoolGradient)
 
 REGISTER_CPU_OPERATOR (Rsqrt, UnaryElementwiseOp< TensorTypes< float >, CPUContext, RsqrtFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (RsqrtGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, RsqrtGradientFunctor< CPUContext >>)
 
ND input tensor Output (0,"Y","ND output tensor")
 
 REGISTER_GRADIENT (Rsqrt, GetRsqrtGradient)
 
 REGISTER_CPU_OPERATOR (Scale, ScaleOp< CPUContext >)
 
 REGISTER_GRADIENT (Scale, GetScaleGradient)
 
 REGISTER_CUDA_OPERATOR (Scale, ScaleOp< CUDAContext >)
 
OpSchema::Cost CostInferenceForSparseLengths (const OperatorDef &def, const vector< TensorShape > &inputs, bool use_weight)
 
 REGISTER_CPU_OPERATOR (SparseLengthsIndicesInGradientWeightedSumWithMainInputGradient, AbstractLengthsWithMainInputGradientOp< float, float, int, CPUContext, WeightedSumReducerDef::template ReducerGradient< float, CPUContext >, true, true >)
 
 REGISTER_CPU_OPERATOR (SparseLengthsIndicesInGradientWeightedSumGradient, AbstractLengthsGradientOp< float, int, CPUContext, WeightedSumReducerDef::template ReducerGradient< float, CPUContext >, true >)
 
 REGISTER_CPU_OPERATOR (SparseLengthsIndicesInGradientSumGradient, AbstractLengthsGradientOp< float, int, CPUContext, SumReducerDef::template ReducerGradient< float, CPUContext >, true >)
 
 OPERATOR_SCHEMA (LengthsIndicesInGradientSumGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_CPU_OPERATOR (LengthsIndicesInGradientSumGradient, AbstractLengthsGradientOp< float, int, CPUContext, SumReducerDef::template ReducerGradient< float, CPUContext >, true >)
 
 REGISTER_CPU_OPERATOR (SparseLengthsIndicesInGradientMeanGradient, AbstractLengthsGradientOp< float, int, CPUContext, MeanReducerDef::template ReducerGradient< float, CPUContext >, true >)
 
 REGISTER_CPU_OPERATOR (LengthsIndicesInGradientMeanGradient, AbstractLengthsGradientOp< float, int, CPUContext, MeanReducerDef::template ReducerGradient< float, CPUContext >, true >)
 
 REGISTER_LENGTHS_OPS_MAIN_INPUT_AND_FORWARD_OUTPUT_GRADIENT (LengthsMax, LengthsMaxWithMainInputAndForwardOutputGradient, AbstractLengthsDef< float, int, CPUContext, MaxReducerDef >)
 
 REGISTER_CPU_OPERATOR (Selu, SeluOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (SeluGradient, SeluGradientOp< float, CPUContext >)
 
affects the activation function itself Input (0,"Y","input tensor").Input(1
 
 REGISTER_GRADIENT (Selu, GetSeluGradient)
 
 REGISTER_CPU_OPERATOR (AddPadding, AddPaddingOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (RemovePadding, RemovePaddingOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (GatherPadding, GatherPaddingOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (PadEmptySamples, PadEmptySamplesOp< CPUContext >)
 
 REGISTER_GRADIENT (AddPadding, GetAddPaddingGradient)
 
 REGISTER_GRADIENT (RemovePadding, GetRemovePaddingGradient)
 
 SetDoc (R"DOC( Given a partitioned tensor $T<N, D_1, ..., D_n>$, where the partitions are defined as ranges on its outer-most (slowest varying) dimension $N$, return a tensor $T<(N + 2 * padding\_width), D_1, ..., D_n>$ with paddings added to the start and end of each range. Optionally, different paddings can be provided for beginning and end. Paddings provided must be a tensor $T<D_1, ..., D_n>$. If no padding is provided, add zero padding. If no lengths vector is provided, add padding only once, at the start and end of data. Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/sequence_ops.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "AddPadding", ["X", "lengths"], ["Y", "lengths_out"], padding_width=1 ) workspace.FeedBlob("X", (np.random.rand(3,2,2).astype(np.float32))) workspace.FeedBlob("lengths", np.array([3]).astype(np.int32)) print("X:", workspace.FetchBlob("X")) workspace.RunOperatorOnce(op) print("Y:", workspace.FetchBlob("Y")) print("lengths_out:", workspace.FetchBlob("lengths_out")) ``` **Result** ``` X: [[[0.2531572 0.4588472 ] [0.45140603 0.61161053]] [[0.92500854 0.8045306 ] [0.03356671 0.30233648]] [[0.4660227 0.6287745 ] [0.79372746 0.08609265]]] Y: [[[0. 0. ] [0. 0. ]] [[0.2531572 0.4588472 ] [0.45140603 0.61161053]] [[0.92500854 0.8045306 ] [0.03356671 0.30233648]] [[0.4660227 0.6287745 ] [0.79372746 0.08609265]] [[0. 0. ] [0. 0. ]]] lengths_out: [5] ``` </details> )DOC").Arg("padding_width"
 
Arg end_padding_width: (optional) padding width for the end of each range; if not set, will use the same as padding_width. Input (0,"data_in","*(type: Tensor)* Input data ($T<N, D_1, ..., D_n>$).").Input(1
 
Inputs 1 and 2: the (optional) lengths vector and the start padding ($T<D_1, ..., D_n>$). Input (3,"end_padding","*(type: Tensor`<int>`)* [OPTIONAL] Padding for range end. If not provided, `start_padding` is used ($T<D_1, ..., D_n>$).").Output(0
 
Output 0: padded data ($T<N + 2*padding\_width, D_1, ..., D_n>$). Output (1,"lengths_out","*(type: Tensor`<int>`)* [OPTIONAL] Lengths for each padded range.")
 
 SetDoc (R"DOC( Remove padding around the edges of each segment of the input data. This is the reverse operation of **AddPadding**, and uses the same arguments and conventions for input and output data format. Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/sequence_ops.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() addpad_op = core.CreateOperator( "AddPadding", ["X", "lengths_add"], ["Y", "lengths_out_add"], padding_width=1 ) rmpad_op = core.CreateOperator( "RemovePadding", ["Y", "lengths_rm"], ["Z", "lengths_out_rm"], padding_width=1 ) workspace.FeedBlob("X", (np.random.randint(20, size=(3,5)))) workspace.FeedBlob("lengths_add", np.array([3]).astype(np.int32)) workspace.FeedBlob("lengths_rm", np.array([5]).astype(np.int32)) print("X:", workspace.FetchBlob("X")) workspace.RunOperatorOnce(addpad_op) print("Y:", workspace.FetchBlob("Y")) print("lengths_out_add:", workspace.FetchBlob("lengths_out_add")) workspace.RunOperatorOnce(rmpad_op) print("Z:", workspace.FetchBlob("Z")) print("lengths_out_rm:", workspace.FetchBlob("lengths_out_rm")) ``` **Result** ``` X: [[17 19 1 9 1] [19 3 5 19 1] [16 0 0 0 4]] Y: [[ 0 0 0 0 0] [17 19 1 9 1] [19 3 5 19 1] [16 0 0 0 4] [ 0 0 0 0 0]] lengths_out_add: [5] Z: [[17 19 1 9 1] [19 3 5 19 1] [16 0 0 0 4]] lengths_out_rm: [3] ``` </details> )DOC").Arg("padding_width"
 
Arg end_padding_width: (optional) padding width for the end of each range; if not set, will use the same as padding_width. Input (0,"data_in","Input tensor ($T<N, D_1, ..., D_n>$).").Input(1
 
Input 1: lengths; if not provided, considers all data as a single segment. Output (0,"data_out","*(type: Tensor)* Padded data tensor ""($T<N + 2*padding_width, D_1, ..., D_n>$).").Output(1
 
 SetDoc (R"DOC( Gather the sum of start and end paddings in a padded input sequence. Used in order to compute the gradients of AddPadding w.r.t the padding tensors. )DOC").Arg("padding_width"
 
Arg padding_width: outer size of padding present around each range. Arg ("end_padding_width","(Optional) Specifies a different end-padding width.").Input(0
 
Input 0: T< N, D1..., Dn > padded input data. Input (1,"lengths","(i64) Num of elements in each range. sum(lengths) = N. If not provided, considers all data as a single segment.").Output(0
 
Output 0: sum of all start paddings, or of all paddings if end_padding_sum is not provided. Output (1,"end_padding_sum","T<D1..., Dn> Sum of all end paddings, if provided.")
 
INT_MAX SetDoc (R"DOC( Pad empty field given lengths and index features, Input(0) is a blob pointing to the lengths of samples in one batch, [Input(1),... Input(num_fields)] a list of tensors containing the data for each field of the features. PadEmptySamples is thread safe. )DOC").Input(0
 
Input 0: a blob containing a pointer to the lengths. Output (0,"out_lengths","Tensor containing lengths with empty sample padded.")
 
 REGISTER_CPU_OPERATOR (Shape, ShapeOp< CPUContext >)
 
this operator only returns the dimensions of the given axes." "Otherwise, the operator returns the dimensions of all axes.") .TensorInferenceFunction ([](const OperatorDef &def, const vector< TensorShape > &in){ArgumentHelper args(def);const vector< int > &axes=args.GetRepeatedArgument< int >("axes");vector< TensorShape > out(1);if(axes.empty()){out[0].add_dims(in[0].dims().size());}else{out[0].add_dims(axes.size());}out[0].set_data_type(TensorProto::INT64);return out;}).SetDoc(R"DOC( Produce a 1D int64 tensor with the shape of the input tensor. If called with an optional argument `axes`
 
If called with the optional `axes` argument, the result will only contain the dimensions of the specified axes.  SHOULD_NOT_DO_GRADIENT (Shape)
 
 REGISTER_CUDA_OPERATOR (Shape, ShapeOp< CUDAContext >)
 
 REGISTER_CPU_OPERATOR (SigmoidGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, SigmoidGradientFunctor< CPUContext >>)
 
 REGISTER_GRADIENT (Sigmoid, GetSigmoidGradient)
 
 REGISTER_CPU_OPERATOR (Sigmoid, UnaryElementwiseOp< TensorTypes< float >, CPUContext, SigmoidFunctor< CPUContext >>)
 
 SetDoc (R"DOC( SigmoidGradient takes both Y and dY and uses this to update dX according to the chain rule and derivatives of the sigmoid function. )DOC")
 
 REGISTER_CUDNN_OPERATOR (Sigmoid, CuDNNActivationOp< CUDNN_ACTIVATION_SIGMOID >)
 
 REGISTER_CUDNN_OPERATOR (SigmoidGradient, CuDNNActivationGradientOp< CUDNN_ACTIVATION_SIGMOID >)
 
 REGISTER_CPU_OPERATOR (Sin, UnaryElementwiseOp< TensorTypes< float >, CPUContext, SinFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (SinGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, SinGradientFunctor< CPUContext >>)
 
 OPERATOR_SCHEMA (SinGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape()
 
 REGISTER_GRADIENT (Sin, GetSinGradient)
 
 REGISTER_CPU_OPERATOR (Sinh, UnaryElementwiseOp< TensorTypes< float >, CPUContext, SinhFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (SinhGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, SinhGradientFunctor< CPUContext >>)
 
 REGISTER_GRADIENT (Sinh, GetSinhGradient)
 
 REGISTER_CPU_OPERATOR (SinusoidPositionEncoding, SinusoidPositionEncodingOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (Slice, SliceOp< CPUContext >)
 
 REGISTER_CPU_GRADIENT_OPERATOR (SliceGradient, SliceGradientOp< CPUContext >)
 
vector< int > dst_sizes (data.dims_size())
 
 Output (0,"Y","(*Tensor*): sliced output tensor").InheritOnnxSchema()
 
 GRADIENT_OPERATOR_SCHEMA (SliceGradient)
 
 REGISTER_GRADIENT (Slice, GetSliceGradient)
 
 REGISTER_CPU_OPERATOR (Softmax, SoftmaxOp< float, CPUContext >)
 
 REGISTER_CPU_GRADIENT_OPERATOR (SoftmaxGradient, SoftmaxGradientOp< float, CPUContext >)
 
Applies the Softmax function to the input, rescaling the elements so that they lie in the range (0, 1) and sum to 1. The softmax operator is typically the last layer in a classifier network, as its output can be interpreted as confidence probabilities of an input belonging to each class. The input is a 2-D tensor (Tensor) of size (batch_size x input_feature_dimensions). The output tensor has the same shape and contains the softmax normalized values of the corresponding input. The softmax function is defined as follows: $$softmax(x_i) = \frac{\exp(x_i)}{\sum_{j} \exp(x_j)}$$
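 
A minimal NumPy sketch of the row-wise softmax (with the usual max subtraction for numerical stability); illustrative only:

```
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))   # subtract row max for stability
    return e / e.sum(axis=1, keepdims=True)

x = np.array([[1.0, 2.0, 3.0]], dtype=np.float32)
print(softmax(x))          # each row is positive and sums to 1
print(softmax(x).sum(1))   # [1.]
```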
 
void SoftmaxCPU (CPUContext &context, const int N, const int D, const float *Xdata, float *Ydata, float *scale, const float *sum_multiplier, bool logarithmic, float *rowmax)
 
 REGISTER_CPU_OPERATOR (SoftmaxWithLoss, SoftmaxWithLossOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (SoftmaxWithLossGradient, SoftmaxWithLossGradientOp< float, CPUContext >)
 
vector< TensorShape > out (2)
 
out[0] set_data_type (logits.data_type())
 
out[0] add_dims (batch_size)
 
out[0] add_dims (num_classes)
 
 SetDoc (R"DOC( Combined Softmax and Cross-Entropy loss operator. The operator first computes the softmax normalized values for each layer in the batch of the given input, then computes cross-entropy loss. This operator is numerically more stable than separate `Softmax` and `CrossEntropy` ops. The inputs are a 2-D tensor `logits` of size (batch_size x input_feature_dimensions), which represents the unscaled log probabilities, and a 1-dimensional integer `labels` tensor for ground truth. An optional third input blob (`weight_tensor`) can be used to weight the samples for the loss, which is useful if the training set is unbalanced. This operator outputs a `softmax` tensor which contains the probability for each label for each example (same shape is `logits` input), and a scalar `loss` value, which is the averaged cross-entropy loss between the softmax probabilities and the ground truth values. Use parameter `label_prob`=1 to enable inputting labels as a probability distribution. Softmax cross-entropy loss function: $$loss(x, class) = -\log{\biggl(\frac{\exp(x[class])}{\sum_{j} \exp(x[j])}\biggr)} = -x[class] + \log{\biggl(\sum_{j} \exp(x[j])\biggr)}$$ or if the `weight_tensor` has been passed: $$loss(x, class) = weight[class]\biggl(-x[class] + \log{\biggl(\sum_{j} \exp(x[j])\biggr)}\biggr)$$ The `logits` input does not need to explicitly be a 2D vector; rather, it will be coerced into one. For an arbitrary n-dimensional tensor `X` in $[a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}]$, where k is the `axis` provided, then `X` will be coerced into a 2-dimensional tensor with dimensions $[(a_0 * ... * a_{k-1}), (a_k * ... * a_{n-1})]$. For the default case where `axis`=1, the `X` tensor will be coerced into a 2D tensor of dimensions $[a_0, (a_1 * ... * a_{n-1})]$, where $a_0$ is often the batch size. In this situation, we must have $a_0 = N$ and $a_1 * ... * a_{n-1} = D$. Each of these dimensions must be matched correctly, or else the operator will throw errors. 
Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/softmax_with_loss_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "SoftmaxWithLoss", ["logits", "labels"], ["softmax", "avgloss"] ) workspace.FeedBlob("logits", np.random.randn(1, 5).astype(np.float32)) workspace.FeedBlob("labels", np.asarray([4]).astype(np.int32)) print("logits:", workspace.FetchBlob("logits")) print("labels:", workspace.FetchBlob("labels")) workspace.RunOperatorOnce(op) print("softmax:", workspace.FetchBlob("softmax")) print("avgloss:", workspace.FetchBlob("avgloss")) ``` **Result** ``` logits: [[-0.3429451 -0.80375195 0.23104447 1.4569176 -0.5268362 ]] labels: [4] softmax: [[0.09721052 0.0613179 0.17258129 0.58800864 0.0808817 ]] avgloss: 2.5147676 ``` </details> <details> <summary> <b>Example 2</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "SoftmaxWithLoss", ["logits", "labels"], ["softmax", "avgloss"], scale=5.0 ) workspace.FeedBlob("logits", np.asarray([[.1, .4, .7, 1.5, .2]]).astype(np.float32)) workspace.FeedBlob("labels", np.asarray([4]).astype(np.int32)) print("logits:", workspace.FetchBlob("logits")) print("labels:", workspace.FetchBlob("labels")) workspace.RunOperatorOnce(op) print("softmax:", workspace.FetchBlob("softmax")) print("avgloss:", workspace.FetchBlob("avgloss")) ``` **Result** ``` logits: [[0.1 0.4 0.7 1.5 0.2]] labels: [4] softmax: [[0.10715417 0.144643 0.19524762 0.4345316 0.11842369]] avgloss: 10.667433 ``` </details> )DOC").Arg("label_prob"
 
 OPERATOR_SCHEMA (SoftmaxWithLossGradient).NumOutputs(1)
 
 REGISTER_CPU_OPERATOR (Softplus, SoftplusOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (SoftplusGradient, SoftplusGradientOp< float, CPUContext >)
 
 REGISTER_GRADIENT (Softplus, GetSoftplusGradient)
 
 REGISTER_CPU_OPERATOR (Softsign, UnaryElementwiseOp< TensorTypes< float >, CPUContext, SoftsignFunctor< CPUContext >>)
 
 REGISTER_CPU_GRADIENT_OPERATOR (SoftsignGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, SoftsignGradientFunctor< CPUContext >>)
 
 SetDoc (R"DOC( Calculates the softsign gradient (sgn(x)/(1+|x|)^2) of the given input tensor element-wise. )DOC").Input(0
 
Input 0: 1-D input tensor. Input (1,"input","1-D input tensor").Output(0
 
Output 0: the softsign gradient (sgn(x)/(1+|x|)^2) values of the input tensor computed element-wise.
 
 REGISTER_GRADIENT (Softsign, GetSoftsignGradient)
 
 REGISTER_CPU_OPERATOR (SpaceToBatch, SpaceToBatchOp< CPUContext >)
 
 OPERATOR_SCHEMA (SpaceToBatch).NumInputs(1).NumOutputs(1).SetDoc(R"DOC( Zero-pads and then rearranges (permutes) blocks of spatial data into batch. More specifically
 
only NCHW order is currently supported (default=\"NCHW\")").Input(0
 
 REGISTER_CPU_OPERATOR (BatchToSpace, BatchToSpaceOp< CPUContext >)
 
 OPERATOR_SCHEMA (BatchToSpace).NumInputs(1).NumOutputs(1).SetDoc(R"DOC( Rearranges (permutes) data from batch into blocks of spatial data
 
 REGISTER_GRADIENT (SpaceToBatch, GetSpaceToBatchGradient)
 
 REGISTER_GRADIENT (BatchToSpace, GetBatchToSpaceGradient)
 
template<typename Context >
void spaceToBatch (const Tensor &input, int pad_t, int pad_l, int block_size, Tensor *output, Context *)
 
template<typename Context >
void batchToSpace (const Tensor &input, int pad_t, int pad_l, int block_size, Tensor *output, Context *)
 
 REGISTER_CPU_OPERATOR (SparseNormalize, SparseNormalizeOp< float, CPUContext >)
 
Input 0: parameters to be normalized. Input (1,"indices","Sparse indices").Input(2
 
Input 2: gradient computed. Output (0,"output_param","Normalized parameters").EnforceOneToOneInplace().Arg("use_max_norm"
 
Arg use_max_norm: a bool variable to control whether to use max norm or constant norm. When use_max_norm = false, constant norm is used so that all the embedding vectors are scaled to have an L2 norm equal to A (see the below argument norm=A). If use_max_norm = true, max norm is used so that each embedding is scaled so that its L2 norm is no larger than A; if an embedding's norm is less than A, the embedding is left unchanged. The default is True. Arg ("norm","L2 norm of the embedding. The default is 1.0.").SetDoc(R"DOC( Given a sparse matrix
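 
A hedged NumPy sketch of the two modes described above (`sparse_normalize` is a hypothetical helper, not the Caffe2 implementation):

```
import numpy as np

def sparse_normalize(param, indices, norm=1.0, use_max_norm=True):
    # use_max_norm=True: rescale a row only if its L2 norm exceeds `norm`.
    # use_max_norm=False: rescale every touched row to L2 norm exactly `norm`.
    out = param.copy()
    for i in indices:
        l2 = np.linalg.norm(out[i])
        if l2 > 0 and (not use_max_norm or l2 > norm):
            out[i] *= norm / l2
    return out

emb = np.array([[3.0, 4.0], [0.3, 0.4]])
print(sparse_normalize(emb, indices=[0, 1]))   # row 0 rescaled, row 1 untouched
```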
 
 SHOULD_NOT_DO_GRADIENT (SparseNormalize)
 
 REGISTER_CPU_OPERATOR (SparseToDense, SparseToDenseOp< CPUContext >)
 
value represented as `indices` vector and `values` tensor into a compacted tensor, where the first dimension is determined by the first dimension of the input if it is given, or the max index. Missing values are filled with zeros. The op supports duplicated indices and performs summation over corresponding values; this behavior is useful for converting GradientSlices into a dense representation. After running this ... `[len(mask)] + shape(default_value)` (if `lengths` is not provided the first dimension is omitted).
 
 REGISTER_CPU_OPERATOR (SpatialBNGradient, SpatialBNGradientOp< CPUContext >)
 
 NumInputs ({5, 7}).NumOutputs(3).AllowInplace(
 
 REGISTER_GRADIENT (SpatialBN, GetSpatialBNGradient)
 
 REGISTER_CPU_OPERATOR (SpatialBN, SpatialBNOp< CPUContext >)
 
 AllowInplace ({{0, 0},{5, 3},{6, 4}}).EnforceInplace(
 
 CostInferenceFunction (CostInferenceForSpatialBN).TensorInferenceFunction([](const OperatorDef &def
 
 if (!is_test)
 
SetDoc(R"DOC( Applies spatial batch normalization to the input tensor as described in the original paper, [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167). Be aware, this operator has two different output sets, depending on the value of *is_test*. According to the paper, the primary operation of spatial batch normalization is: $$Y = \frac{X - \mu_x}{\sqrt{\sigma^2_{x} + \epsilon}}*\gamma + b$$ In the equation, $\mu_x$ is the *mean*, $X$ is the input data, $\sigma^2_{x}$ is the *var*, $\epsilon$ is *epsilon*, $\gamma$ is the *scale*, $b$ is the *bias*, and $Y$ is the output data. The *momentum* arg also affects this calculation in the computation of the running mean and variance. The influence of *momentum* is as follows: $$running\_mean = running\_mean * momentum + mean * (1 - momentum)$$ $$running\_var = running\_var * momentum + var * (1 - momentum)$$ Output when is_test = 0 (train mode): *Y, mean, var, saved_mean, saved_var* Output when is_test = 1 (test mode): *Y* Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/spatial_batch_norm_op.cc - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/spatial_batch_norm_op.h )DOC").ArgIsTest("*(type run spatial batch normalization in test mode Arg ("epsilon","*(type: float; default: 1e-5)* The epsilon value to use to avoid division by zero.").Arg("order"
 
default where $N is batch $C is number of $H is spatial and $W is spatial width The only other valid option is NHWC Arg ("momentum","*(type: float; default: 0.9)* Factor used in computing the running mean and variance. e.g., running_mean = running_mean x momentum + mean x (1 - momentum)").Arg("num_batches"
 
default The input dimensional tensor of shape $NCHW or $NHWC depending on the order parameter Input (1,"scale","The scale as a 1-dimensional tensor of size $C$ to be applied to the output.").Input(2
 
default The input dimensional tensor of shape $NCHW or $NHWC depending on the order parameter The bias as a dimensional tensor of size $C to be applied to the output Input (3,"mean","The running mean (training) or the estimated mean (testing) as a 1-dimensional tensor of size $C$.").Input(4
 
default The input dimensional tensor of shape $NCHW or $NHWC depending on the order parameter The bias as a dimensional tensor of size $C to be applied to the output The running variance (training) or the estimated variance(testing) as a 1-dimensional tensor of size $C $.") .Input( 5
 
default The input dimensional tensor of shape $NCHW or $NHWC depending on the order parameter The bias as a dimensional tensor of size $C to be applied to the output The running *optional *Per channel sums of elements to be used to determine the mean and variance for this batch Input (6,"sumsq","*(optional)* Per-channel sum of elements squared per channel to be used to determine the variance for this batch.").Output(0
 
default The input dimensional tensor of shape $NCHW or $NHWC depending on the order parameter The bias as a dimensional tensor of size $C to be applied to the output The running *optional *Per channel sums of elements to be used to determine the mean and variance for this batch The output dimensional tensor of the same shape as $X Output (1,"mean","The running mean after the spatial BN operator. Must be in-place with the input *mean*. Should not be used for testing.").Output(2
 
default The input dimensional tensor of shape $NCHW or $NHWC depending on the order parameter The bias as a dimensional tensor of size $C to be applied to the output The running *optional *Per channel sums of elements to be used to determine the mean and variance for this batch The output dimensional tensor of the same shape as $X The running variance after the spatial BN operator.Must be in-place with the input *var *.Should not be used for testing.") .Output (3,"saved_mean","Saved mean used during training to speed up gradient computation. Should not be used for testing.").Output(4
 
default The input dimensional tensor of shape $NCHW or $NHWC depending on the order parameter The bias as a dimensional tensor of size $C to be applied to the output The running *optional *Per channel sums of elements to be used to determine the mean and variance for this batch The output dimensional tensor of the same shape as $X The running variance after the spatial BN Saved variance used during training to speed up gradient computation Should not be used for testing InheritOnnxSchema ("BatchNormalization")
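A test-mode sketch of SpatialBN using the inputs documented above (X, scale, bias, mean, var) and `is_test=1`; with unit scale, zero bias, zero mean and unit variance the output is approximately X / sqrt(1 + epsilon).

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
op = core.CreateOperator(
    "SpatialBN",
    ["X", "scale", "bias", "mean", "var"],
    ["Y"],
    is_test=1,
)
N, C, H, W = 2, 3, 4, 4
workspace.FeedBlob("X", np.random.randn(N, C, H, W).astype(np.float32))
workspace.FeedBlob("scale", np.ones(C, dtype=np.float32))
workspace.FeedBlob("bias", np.zeros(C, dtype=np.float32))
workspace.FeedBlob("mean", np.zeros(C, dtype=np.float32))
workspace.FeedBlob("var", np.ones(C, dtype=np.float32))
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("Y").shape)  # (2, 3, 4, 4), same shape as X
```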
 
 REGISTER_CPU_OPERATOR (SpatialSoftmaxWithLoss, SpatialSoftmaxWithLossOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (SpatialSoftmaxWithLossGradient, SpatialSoftmaxWithLossGradientOp< float, CPUContext >)
 
 CAFFE_ENFORCE_EQ (logits.dims_size(), 4)
 
 CAFFE_ENFORCE_EQ (labels.dims_size(), 3)
 
out[0] add_dims (in[0].dims(2))
 
out[0] add_dims (in[0].dims(3))
 
 SetDoc (R"DOC( Combined Spatial Softmax and Cross-Entropy loss operator. Similar to SoftmaxWithLoss, this operator computes the spatial softmax normalized values for each layer in the batch of the given input, after which cross-entropy loss is computed. This operator is numerically more stable than separate Softmax and CrossEntropy ops. The inputs are a 2-D tensor (Tensor) of size (batch_size x input_feature_dimensions) and tensor of labels (ground truth). Output is tensor with the probability for each label in a pixel for each example (N x D x W x H) and averaged loss (scalar). For spatial softmax, weighting is by x,y position of the input. )DOC").Input(0
 
Input(0, "logits", "Unscaled log probabilities").Input(1, "labels", "Ground truth").Input(2, ..., "Optional blob to be used to weight the samples for the loss; with spatial, weighting is by x,y of the input").Output(0, "softmax", "Tensor with softmax cross entropy loss").Output(1
 
 OPERATOR_SCHEMA (SpatialSoftmaxWithLossGradient).NumOutputs(1)
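A hedged sketch of SpatialSoftmaxWithLoss mirroring the SoftmaxWithLoss example earlier on this page: logits are 4-D and labels are 3-D, per the shape checks above; blob names are illustrative.

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
op = core.CreateOperator(
    "SpatialSoftmaxWithLoss",
    ["logits", "labels"],
    ["softmax", "avgloss"],
)
workspace.FeedBlob("logits", np.random.randn(1, 5, 2, 2).astype(np.float32))   # N x D x H x W
workspace.FeedBlob("labels", np.random.randint(5, size=(1, 2, 2)).astype(np.int32))  # N x H x W
workspace.RunOperatorOnce(op)
print("softmax shape:", workspace.FetchBlob("softmax").shape)
print("avgloss:", workspace.FetchBlob("avgloss"))  # scalar average loss
```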
 
 REGISTER_CPU_OPERATOR (Sqr, UnaryElementwiseOp< TensorTypes< float >, CPUContext, SqrFunctor< CPUContext >>)
 
 REGISTER_CUDA_OPERATOR (Sqr, UnaryElementwiseOp< TensorTypes< float >, CUDAContext, SqrFunctor< CUDAContext >>)
 
 REGISTER_CPU_OPERATOR (Sqrt, UnaryElementwiseOp< TensorTypes< float >, CPUContext, SqrtFunctor< CPUContext >>)
 
 REGISTER_GRADIENT (Sqrt, GetSqrtGradient)
 
 REGISTER_CUDA_OPERATOR (Sqrt, UnaryElementwiseOp< TensorTypes< float >, CUDAContext, SqrtFunctor< CUDAContext >>)
 
 REGISTER_CPU_OPERATOR (SquareRootDivide, SquareRootDivideOp< CPUContext >)
 
 SetDoc (R"DOC( Given DATA tensor with first dimension N and SCALE vector of the same size N produces an output tensor with same dimensions as DATA. Which consists of DATA slices. i-th slice is divided by sqrt(SCALE[i]) elementwise. If SCALE[i] == 0 output slice is identical to the input one (no scaling) Example: Data = [ [2.0, 4.0], [9.0, 12.0] ] SCALE = [4, 9] OUTPUT = [ [1.0, 2.0], [3.0, 4.0] ] )DOC")
 
 REGISTER_GRADIENT (SquareRootDivide, GetSquareRootDivideGradient)
 
 REGISTER_CPU_OPERATOR (StatRegistryCreate, StatRegistryCreateOp)
 
 REGISTER_CPU_OPERATOR (StatRegistryUpdate, StatRegistryUpdateOp)
 
 REGISTER_CPU_OPERATOR (StatRegistryExport, StatRegistryExportOp)
 
 REGISTER_CPU_OPERATOR (TimerBegin, TimerBeginOp)
 
 REGISTER_CPU_OPERATOR (TimerEnd, TimerEndOp)
 
 REGISTER_CPU_OPERATOR (TimerGetAndEnd, TimerGetAndEndOp)
 
 REGISTER_CPU_OPERATOR (TimerGet, TimerGetOp)
 
SetDoc(R"DOC( Update the given StatRegistry, or the global StatRegistry, with the values of counters for the given keys. )DOC").Input(0, "keys", "1D string tensor with the key names to update.").Input(1, "values", "1D int64 tensor with the values to update.").Input(2, "handle", "If provided, update the given StatRegistry. Otherwise, update the global singleton.")
 
SetDoc(R"DOC( If a handle is given, export values from the given StatRegistry; otherwise, export values from the global singleton StatRegistry. )DOC").Output(0, "keys", "1D string tensor with exported key names").Output(1, "values", "1D int64 tensor with exported values").Output(2, "timestamps", "The unix timestamp at counter retrieval.").Arg("reset"
 
Arg(..., "if not set, use output name").Output(0, "timer", "(*Tensor`<ptr>`*): pointer to a timer object")
 
Input(0, "timer", "(*Tensor`<ptr>`*): pointer to a timer object; obtained from the **TimerBegin** op").Output(0, "nanos", "(*Tensor`<int64>`*): scalar tensor containing time in nanoseconds")
 
NumInputs(1).NumOutputs(1).SetDoc(R"DOC( Queries the current time of a timer object in nanoseconds. )DOC").Input(0, "timer", "(*Tensor`<ptr>`*): pointer to a timer object; obtained from the **TimerBegin** op").Output(0, "nanos", "(*Tensor`<int64>`*): scalar containing time in nanoseconds")
 
 CAFFE_KNOWN_TYPE (TimerInstance *)
 
 CAFFE_KNOWN_TYPE (std::unique_ptr< caffe2::StatRegistry >)
 
 REGISTER_TEMPLATED_STAT_PUT_OP (AveragePut, AveragePutStat, CAFFE_AVG_EXPORTED_STAT).NumInputs(1).NumOutputs(0).Arg("name"
 
"(*str*): name of the stat; if not present, then uses name of input blob").Arg("magnitude_expand", "(*int64_t*): number to multiply input values by (used when inputting floats, as stats can only receive integers)").Arg("bound"
 
 REGISTER_TEMPLATED_STAT_PUT_OP (IncrementPut, IncrementPutStat, CAFFE_EXPORTED_STAT).NumInputs(1).NumOutputs(0).Arg("name"
 
 REGISTER_TEMPLATED_STAT_PUT_OP (StdDevPut, StdDevPutStat, CAFFE_STDDEV_EXPORTED_STAT).NumInputs(1).NumOutputs(0).Arg("name"
 
 REGISTER_CPU_OPERATOR (StopGradient, StopGradientOp< CPUContext >)
 
 NumInputs (1, 1).NumOutputs(1
 
 NO_GRADIENT (StopGradient)
 
 REGISTER_CUDA_OPERATOR (StopGradient, StopGradientOp< CUDAContext >)
 
 REGISTER_CPU_OPERATOR (StumpFunc, StumpFuncOp< float, float, CPUContext >)
 
tensor of float Output (0,"Y","tensor of float").SetDoc(R"DOC( Converts each input element into either high_ or low_value based on the given threshold. )DOC")
 
 NO_GRADIENT (StumpFunc)
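A hedged sketch for StumpFunc; the `threshold`, `low_value` and `high_value` argument names are assumptions inferred from the description above.

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
# argument names below are assumed from the operator description
op = core.CreateOperator(
    "StumpFunc", ["X"], ["Y"],
    threshold=0.0, low_value=-1.0, high_value=1.0,
)
workspace.FeedBlob("X", np.array([-2.0, -0.5, 0.5, 3.0], dtype=np.float32))
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("Y"))  # expected [-1. -1.  1.  1.]
```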
 
 REGISTER_CPU_OPERATOR (StumpFuncIndex, StumpFuncIndexOp< float, int64_t, CPUContext >)
 
tensor of float Output (0,"Index_Low","tensor of int64 indices for elements below/equal threshold").Output(1
 
tensor of float tensor of int64 indices for elements above threshold SetDoc (R"DOC( Split the elemnts and return the indices based on the given threshold. )DOC")
 
 NO_GRADIENT (StumpFuncIndex)
 
 REGISTER_CPU_OPERATOR (Summarize, SummarizeOp< float, CPUContext >)
 
 SetDoc (R"DOC( Summarize computes four statistics of the input tensor (Tensor)- min, max, mean and standard deviation. The output will be written to a 1-D tensor of size 4 if an output tensor is provided. Else, if the argument 'to_file' is greater than 0, the values are written to a log file in the root folder. )DOC").Arg("to_file"
 
"flag to indicate if the summarized statistics have to be written to a log file").Input(0, "data", "The input data as Tensor.").Output(0
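A small sketch of Summarize writing the four statistics (min, max, mean, standard deviation, per the doc above) to an output blob.

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
op = core.CreateOperator("Summarize", ["data"], ["summary"])
workspace.FeedBlob("data", np.arange(12, dtype=np.float32).reshape(3, 4))
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("summary"))  # 1-D tensor of size 4: [min, max, mean, std]
```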
 
 SHOULD_NOT_DO_GRADIENT (Summarize)
 
 REGISTER_CPU_OPERATOR (Swish, UnaryElementwiseOp< TensorTypes< float >, CPUContext, SwishFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (SwishGradient, SwishGradientOp< CPUContext >)
 
 SetDoc (R"DOC( SwishGradient takes X, Y and dY and uses this to update dX according to the chain rule and derivatives of the swish function. )DOC")
 
 REGISTER_GRADIENT (Swish, GetSwishGradient)
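A hedged sketch of Swish, assuming the conventional definition swish(x) = x * sigmoid(x) (the SwishGradient doc above only states that the gradient follows from the swish derivative).

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
op = core.CreateOperator("Swish", ["X"], ["Y"])
X = np.random.randn(4).astype(np.float32)
workspace.FeedBlob("X", X)
workspace.RunOperatorOnce(op)
Y = workspace.FetchBlob("Y")
# swish(x) = x * sigmoid(x) = x / (1 + exp(-x)) -- definition assumed
np.testing.assert_allclose(Y, X / (1.0 + np.exp(-X)), rtol=1e-5)
```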
 
 REGISTER_CPU_OPERATOR (Tan, UnaryElementwiseOp< TensorTypes< float >, CPUContext, TanFunctor< CPUContext >>)
 
 REGISTER_CPU_OPERATOR (TanGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, TanGradientFunctor< CPUContext >>)
 
 OPERATOR_SCHEMA (TanGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape()
 
 REGISTER_GRADIENT (Tan, GetTanGradient)
 
 REGISTER_CPU_OPERATOR (TanhGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, TanhGradientFunctor< CPUContext >>)
 
 REGISTER_GRADIENT (Tanh, GetTanhGradient)
 
 REGISTER_CPU_OPERATOR (Tanh, UnaryElementwiseOp< TensorTypes< float >, CPUContext, TanhFunctor< CPUContext >>)
 
Tanh can also be run in-place by providing the same input and output blobs (see the sketch below).
 
 OPERATOR_SCHEMA (TanhGradient).NumInputs(2).NumOutputs(1).AllowInplace(
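A sketch of the in-place Tanh usage hinted at by the fragment above (same blob used as input and output); the 3x3 random input mirrors the remnant of the original example.

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
# in-place: the same blob "X" is used for input and output
op = core.CreateOperator("Tanh", ["X"], ["X"])
X = np.random.randn(3, 3).astype(np.float32)
workspace.FeedBlob("X", X)
print("X before:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("X after:", workspace.FetchBlob("X"))  # equals np.tanh of the original X
```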
 
 REGISTER_CUDNN_OPERATOR (Tanh, CuDNNActivationOp< CUDNN_ACTIVATION_TANH >)
 
 REGISTER_CUDNN_OPERATOR (TanhGradient, CuDNNActivationGradientOp< CUDNN_ACTIVATION_TANH >)
 
 REGISTER_CPU_OPERATOR (TensorProtosDBInput, TensorProtosDBInput< CPUContext >)
 
INT_MAX SetDoc (R"DOC( TensorProtosDBInput is a simple input operator that basically reads things from a db where each key-value pair stores an index as key, and a TensorProtos object as value. These TensorProtos objects should have the same size, and they will be grouped into batches of the given size. The DB Reader is provided as input to the operator and it returns as many output tensors as the size of the TensorProtos object. Each output will simply be a tensor containing a batch of data with size specified by the 'batch_size' argument containing data from the corresponding index in the TensorProtos objects in the DB. )DOC").Arg("batch_size"
 
.Arg("batch_size", "the number of samples in a batch; the default value means that the operator will attempt to insert the entire data in a single output blob.").Input(0, "data", "A pre-initialized DB reader. Typically, this is obtained by calling CreateDB operator with a db_name and a db_type. The resulting output blob is a DB Reader tensor").Output(0, ..., "The output tensor in which the batches of data are returned. The number of output tensors is equal to the size of (number of TensorProto's in) the TensorProtos objects stored in the DB as values. Each output tensor will be of size specified by the 'batch_size' argument of the operator")
 
 NO_GRADIENT (TensorProtosDBInput)
 
 REGISTER_CUDA_OPERATOR (TensorProtosDBInput, TensorProtosDBInput< CUDAContext >)
 
void convert (TensorProto_DataType dst_type, const char *src_start, const char *src_end, void *dst)
 
 CAFFE_KNOWN_TYPE (std::unique_ptr< TextFileReaderInstance >)
 
 REGISTER_CPU_OPERATOR (CreateTextFileReader, CreateTextFileReaderOp)
 
 REGISTER_CPU_OPERATOR (TextFileReaderRead, TextFileReaderReadOp)
 
Path to the file Arg ("num_passes","Number of passes over the file.").Arg("field_types"
 
Path to the file List with type of each field Type enum is found at core DataType Output (0,"handler","Pointer to the created TextFileReaderInstance.")
 
INT_MAX SetDoc ("Read a batch of rows from the given text file reader instance. ""Expects the number of fields to be equal to the number of outputs. ""Each output is a 1D tensor containing the values for the given field ""for each row. When end of file is reached, returns empty tensors.").Input(0
 
Input(0, "handler", "Pointer to an existing TextFileReaderInstance.").Arg("batch_size", "Maximum number of rows to read.")
 
 NO_GRADIENT (CreateTextFileReader)
 
 NO_GRADIENT (TextFileReaderRead)
 
 REGISTER_CPU_OPERATOR (ThresholdedRelu, ThresholdedReluOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (ThresholdedReluGradient, ThresholdedReluGradientOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (Tile, TileOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (TileGradient, TileGradientOp< CPUContext >)
 
 if (in.size() > 1)
 
out[0] set_dims (canonical_axis, out[0].dims().Get(canonical_axis)*tiles)
 
 SetDoc (R"DOC( Constructs a tensor by tiling a given tensor along a specified axis. This operation creates a new tensor by replicating the input tensor a number of times specified by the `tiles` argument along the `axis` dimension. The output tensor's `axis` dimension has $(X.dims(axis) * tiles)$ elements. Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/tile_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "Tile", ["X", "tiles", "axis"], ["Y"] ) workspace.FeedBlob("X", np.random.randint(10, size=(5,5))) workspace.FeedBlob("tiles", np.array([5]).astype(np.int32)) workspace.FeedBlob("axis", np.array([1]).astype(np.int32)) print("X:", workspace.FetchBlob("X")) workspace.RunOperatorOnce(op) print("Y:", workspace.FetchBlob("Y")) ``` **Result** ``` X: [[9 1 7 1 3] [2 3 6 2 5] [0 9 2 6 4] [5 8 1 5 9] [2 0 1 3 7]] Y: [[9 1 7 1 3 9 1 7 1 3 9 1 7 1 3 9 1 7 1 3 9 1 7 1 3] [2 3 6 2 5 2 3 6 2 5 2 3 6 2 5 2 3 6 2 5 2 3 6 2 5] [0 9 2 6 4 0 9 2 6 4 0 9 2 6 4 0 9 2 6 4 0 9 2 6 4] [5 8 1 5 9 5 8 1 5 9 5 8 1 5 9 5 8 1 5 9 5 8 1 5 9] [2 0 1 3 7 2 0 1 3 7 2 0 1 3 7 2 0 1 3 7 2 0 1 3 7]] ``` </details> )DOC").Arg("tiles"
 
 OPERATOR_SCHEMA (TileGradient).NumInputs(1
 
 REGISTER_GRADIENT (Tile, GetTileGradient)
 
 REGISTER_CPU_OPERATOR (TopK, TopKOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (TopKGradient, TopKGradientOp< float, CPUContext >)
 
 TensorInferenceFunction ([](const OperatorDef &def, const vector< TensorShape > &in){vector< TensorShape > out={in[0], in[0]};ArgumentHelper helper(def);auto k=helper.GetSingleArgument("k",-1);auto dims_size=in[0].dims_size();out[0].set_dims(dims_size-1, k);out[1].set_dims(dims_size-1, k);out[1].set_data_type(TensorProto_DataType_INT32);if(def.output_size() > 2){TensorShape flatten_indices_shape;flatten_indices_shape.set_data_type(TensorProto_DataType_INT32);flatten_indices_shape.add_dims(std::accumulate(in[0].dims().begin(), in[0].dims().end()-1, 1, std::multiplies< long >())*k);out.push_back(flatten_indices_shape);}return out;}).SetDoc(R"DOC( Retrieve the top-K elements of the last dimension. Given an input tensor of shape $(a_1
 
, a_2, ..., a_n, r)$ and integer argument `k`, return up to three outputs: 1. Value tensor of shape $(a_1, a_2, ..., a_n, k)$ which contains the values of the top k elements along the last dimension. 2. Index tensor of shape $(a_1, a_2, ..., a_n, k)$ which contains the indices of the top k elements (original indices from the input tensor). 3. [OPTIONAL] Flattened index tensor of shape $(a_1 * a_2 * ... * a_n * k,)$. Given two equivalent values, this operator uses the indices along the last dimension as a tiebreaker; that is, the element with the lower index will appear first. )DOC")
.Output(1, "Indices", "(*Tensor`<int>`*): tensor of indices of shape $(a_1, a_2, ..., a_n, k)$; indices values refer to each element's index in the last dimension of the `X` input tensor")
.Output(2, "Flattened_indices", "(*Tensor`<int>`*): tensor of indices of shape $(a_1 * a_2 * ... * a_n * k,)$; indices values refer to each element's index in the flattened input tensor `X`")
.Arg("k"
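The embedded example was lost in extraction, so here is a minimal sketch of TopK using the `k` argument referenced in the shape-inference lambda above; output blob names are illustrative.

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
op = core.CreateOperator("TopK", ["X"], ["Values", "Indices"], k=2)
workspace.FeedBlob("X", np.array([[1., 3., 2., 5.],
                                  [6., 4., 0., 7.]], dtype=np.float32))
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("Values"))   # [[5. 3.] [7. 6.]]
print(workspace.FetchBlob("Indices"))  # [[3 1] [3 0]]
```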
 
 OPERATOR_SCHEMA (TopKGradient).NumInputs(3).NumOutputs(1)
 
 REGISTER_GRADIENT (TopK, GetTopKGradient)
 
 REGISTER_CPU_OPERATOR (Transpose, TransposeOp< CPUContext >)
 
 if (axes.empty())
 
 CAFFE_ENFORCE (valid_axes,"Axes argument passed in had invalid values")
 
 CAFFE_ENFORCE (axes.size()==tensor_size,"Axes argument passed in had the incorrect size")
 
 for (auto axis=axes.begin();axis!=axes.end();++axis)
 
 SetDoc (R"DOC( Transpose the input tensor by permuting the axes of the input according to the `axes` argument. Similar to numpy's [transpose](https://docs.scipy.org/doc/numpy/reference/generated/numpy.transpose.html) function. For example, when axes=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output shape will be (2, 1, 3). Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/transpose_op.cc <details> <summary> <b>Example</b> </summary> **Code** ``` workspace.ResetWorkspace() op = core.CreateOperator( "Transpose", ["X"], ["Y"], axes=(0,3,1,2) ) x = np.random.rand(1,32,32,3) workspace.FeedBlob("X", x) print("X.shape(NHWC order):", workspace.FetchBlob("X").shape) workspace.RunOperatorOnce(op) print("Y.shape(NCHW order):", workspace.FetchBlob("Y").shape) ``` **Result** ``` X.shape (NHWC order): (1, 32, 32, 3) Y.shape (NCHW order): (1, 3, 32, 32) ``` </details> )DOC").Arg("axes"
 
 REGISTER_GRADIENT (Transpose, GetTransposeGradient)
 
 REGISTER_CUDNN_OPERATOR (Transpose, CuDNNTransposeOp)
 
 REGISTER_CPU_OPERATOR (Unique, UniqueOp< CPUContext >)
 
 SetDoc (R"DOC( Deduplicates input indices vector and optionally produces reverse remapping. There's no guarantees on the ordering of the output indices. )DOC").Input(0
 
"1D tensor of int32 or int64 indices.").Output(0, "unique_indices", "1D tensor of deduped entries.").Output(1, "remapping", "(optional) mapping from `indices` to `unique_indices`. This has the same shape as `indices`. Its elements are the indices into `unique_indices` such that `Gather(['unique_indices', 'remapping'])` yields `indices`.").TensorInferenceFunction([](const OperatorDef& def
 
 CAFFE_ENFORCE_EQ (in[0].dims_size(), 1)
 
 if (in[0].dims(0)<=1)
 
 if (def.output_size() > 1)
 
 SHOULD_NOT_DO_GRADIENT (Unique)
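A sketch of Unique with the optional remapping output. The ordering of `unique_indices` is not guaranteed, but gathering it with `remapping` reproduces the original vector, as the schema above states.

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
op = core.CreateOperator("Unique", ["indices"], ["unique_indices", "remapping"])
indices = np.array([2, 7, 2, 0, 7], dtype=np.int64)  # input must be 1-D
workspace.FeedBlob("indices", indices)
workspace.RunOperatorOnce(op)
unique_indices = workspace.FetchBlob("unique_indices")
remapping = workspace.FetchBlob("remapping")
# Gather(unique_indices, remapping) yields the original indices regardless of ordering
np.testing.assert_array_equal(unique_indices[remapping], indices)
```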
 
 REGISTER_CPU_OPERATOR (UpsampleBilinear, UpsampleBilinearOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (UpsampleBilinearGradient, UpsampleBilinearGradientOp< float, CPUContext >)
 
 REGISTER_GRADIENT (UpsampleBilinear, GetUpsampleBilinearGradient)
 
std::vector< TensorShape > WeightedSumShapeInference (const OperatorDef &, const vector< TensorShape > &in)
 
OpSchema::Cost CostInferenceForWeightedSum (const OperatorDef &, const vector< TensorShape > &in)
 
 REGISTER_CPU_OPERATOR (WallClockTime, WallClockTimeOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (Print, PrintOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (FlattenToVec, FlattenToVecOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (Alias, AliasOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (ResizeLike, ResizeLikeOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (SumInt, SumOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (WeightedSum, WeightedSumOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (WeightedSumGradient, WeightedSumGradientOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (ScatterWeightedSum, ScatterWeightedSumOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (ScatterAssign, ScatterAssignOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (LengthsToShape, LengthsToShapeOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (HasElements, HasElementsOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (GatherRanges, GatherRangesOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (LengthsGather, LengthsGatherOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (LengthsToSegmentIds, LengthsToSegmentIdsOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (LengthsToRanges, LengthsToRangesOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (SegmentIdsToLengths, SegmentIdsToLengthsOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (SegmentIdsToRanges, SegmentIdsToRangesOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (LengthsToWeights, LengthsToWeightsOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (EnsureDense, EnsureDenseOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (AccumulateHistogram, AccumulateHistogramOp< float, CPUContext >)
 
Arg(..., "(bool) if 1, saves contents to the root folder of the current workspace, appending the tensor contents to a file named after the blob name; otherwise, logs to stderr").Arg("limit", "(int, default 0) If set, prints the first `limit` elements of tensor. If 0, prints the first `k_limit_default` (1000) elements of tensor").Arg("every_n"
 
"Print tensor every `every_n` runs").Input(0, "tensor", "The tensor to print.")
 
OpSchema::Cost CostInferenceForSum (const OperatorDef &def, const std::vector< TensorShape > &in)
 
 REGISTER_CUDNN_OPERATOR (WeightedSum, CuDNNWeightedSumOp)
 
 REGISTER_CPU_OPERATOR (VariableLengthSequencePadding, VariableLengthSequencePaddingOp< float, CPUContext >)
 
 SetDoc (R"DOC( Super special-case operator. Used to pad a tensor to mimic pytorch's pad_packed_sequence. Given an input tensor INPUT of size NxBxM and an input tensor LENS of size B, where N = maximum sequence length B = batch size M = hidden size set each element of INPUT to zero if it is is past the end of the corresponding sequence (i.e. if LENS[j] > i for an index (i,j,k)). )DOC")
 
 REGISTER_CPU_OPERATOR (WeightedMultiSampling, WeightedMultiSamplingOp< CPUContext >)
 
 if (in[0].dims(0)==0)
 
 if (args.HasArgument("num_samples"))
 
 SetDoc (R"DOC( The operator performs sampling based on the input sampling weights. All weights are cummulative probability thus sorted. The output is a 1-D tensor (Tensor). If two inputs are given, the second input is used to provide shape of the output sample tensor. Otherwise, we use argument `num_samples` to determine the number of samples to generate. )DOC").Input(0
 
An optional D Tensor Input cumulative sampling probability (such as[0.2, 0.5, 0.8, 1.5])." "All weights must be non-negative numbers.Note that the last value of" "CDF is not necessary 1.If the last value is not 1
 
An optional D Tensor Input cumulative sampling all values in sampling_cdf will be scaled by this number Input (1,"shape_tensor (optional)","Tensor whose shape will be applied to output.").Output(0
 
An optional D Tensor Input cumulative sampling all values in sampling_cdf will be scaled by this number The output tensor contains indices sampled from distribution given by the weight vector in the input tensor The output is a D Tensor of size determined by argument num_samples or the second input tensor Arg ("num_samples","number of samples to sample from the input data")
 
 SHOULD_NOT_DO_GRADIENT (WeightedMultiSample)
 
 REGISTER_CPU_OPERATOR (WeightedSample, WeightedSampleOp< float, CPUContext >)
 
 TensorInferenceFunction ([](const OperatorDef &def, const vector< TensorShape > &in){vector< TensorShape > out(2);int batch_size=in[0].dims(0);out[0]=CreateTensorShape(vector< int >{batch_size}, TensorProto::INT32);out[1]=CreateTensorShape(vector< int >{batch_size}, TensorProto::FLOAT);return out;}).SetDoc(R"DOC( The operator performs sampling based on the input sampling weights for each batch. All weights must be non-negative numbers. The input is a 2-D tensor (Tensor) of size (batch_size x weights_dim). For each batch
 
an index is randomly sampled from the distribution given by the weights of the corresponding batch. )DOC")
.Input(0, ..., "A 2-D Tensor of size (batch_size x weights_dim). All weights must be non-negative numbers.")
.Input(1, ..., "An optional 2-D Tensor ...")
.Output(0, ..., "The output tensor contains index(es) sampled from the distribution given by the weight vector(s) in the input tensor. The output is a 1-D Tensor of size (batch_size x 1).")
.Output(1, ..., "The output tensor contains value(s) selected by the sampled index(es). It is a 1-D Tensor of size (batch_size x 1).")
 
 SHOULD_NOT_DO_GRADIENT (WeightedSample)
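A sketch of WeightedSample with a single weights input and a single indices output, following the shape-inference lambda above which produces a 1-D int32 tensor of size batch_size; blob names are illustrative.

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
op = core.CreateOperator("WeightedSample", ["weights"], ["indices"])
weights = np.array([[0.0, 0.2, 0.8],
                    [1.0, 0.0, 0.0]], dtype=np.float32)  # batch_size x weights_dim
workspace.FeedBlob("weights", weights)
workspace.RunOperatorOnce(op)
idx = workspace.FetchBlob("indices")  # shape (2,), int32
print(idx)  # idx[1] is always 0, since row 1 puts all of its weight on column 0
```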
 
 REGISTER_CPU_OPERATOR (While, WhileOp< CPUContext >)
 
INT_MAX SetDoc (R"DOC( 'While' control operator, first input is a scalar boolean blob that stores loop's condition value. Accepts 'loop_net' (required) and 'cond_net' (optional) arguments for loop's body and condition subnets respectively. If condition subnet is specified, it is executed before the first and after each iteration. Subnets are executed in the same workspace as 'While'. )DOC").Arg("loop_net"
 
"Net executed on each iteration").Arg("cond_net", "Net to (re)compute condition value").Input(0
 
 REGISTER_CUDA_OPERATOR (While, WhileOp< CUDAContext >)
 
 REGISTER_CPU_OPERATOR (ZeroGradient, ZeroGradientOp< CPUContext >)
 
 REGISTER_GRADIENT (ZeroGradient, GetZeroGradientOpGradient)
 
 REGISTER_CUDA_OPERATOR (ZeroGradient, ZeroGradientOp< CUDAContext >)
 
std::vector< TensorShape > InferOutput (const OperatorDef &op, const std::vector< TensorShape > &input_shapes)
 
 C10_DEFINE_REGISTRY (ConverterRegistry, Converter)
 
repr::NeuralNetOperator::NNLayout getLayout (std::map< std::string, caffe2::Argument > argMap)
 
std::vector< int > getKernelShape (std::map< std::string, caffe2::Argument > argMap)
 
std::unique_ptr< repr::NeuralNetOperator > convertToNeuralNetOperator (const caffe2::OperatorDef &op)
 
repr::NNModule convertToNNModule (const caffe2::NetDef &net, bool strict, std::vector< repr::NNGraph::NodeRef > *opNodeVec)
 Ingest a caffe2 protobuf model and output an NNModule. More...
 
caffe2::OperatorDef convertToOperatorDef (const repr::NNGraph::NodeRef &instrNode)
 
Caffe2Annotation * getOrAddCaffe2Annotation (nom::repr::NNGraph::NodeRef &instrNode)
 
caffe2::NetDef convertToCaffe2Proto (repr::NNModule &m)
 
std::vector< std::string > mergeExternalTensors (const std::unordered_set< repr::NNGraph::NodeRef > &currExternal, const std::vector< std::string > &oldExternal)
 
caffe2::NetDef convertToCaffe2Proto (repr::NNModule &m, const caffe2::NetDef &oldNet)
 
void pushOpToFront (caffe2::OperatorDef &op, caffe2::NetDef *net)
 
void injectDataEdgeIndicators (caffe2::NetDef *net)
 
void removeDataEdgeIndicators (caffe2::NetDef *net)
 
CAFFE2_API nom::repr::NNModule convertToNNModule (const caffe2::NetDef &net, bool strict=false, std::vector< nom::repr::NNGraph::NodeRef > *=nullptr)
 Ingest a caffe2 protobuf model and output an NNModule. More...
 
 C10_DECLARE_REGISTRY (ConverterRegistry, Converter)
 
void setDeviceOption (NNGraph::NodeRef n, caffe2::DeviceOption &d)
 
void addBlobDeviceOptions (std::map< std::string, caffe2::DeviceOption > blobMap, nom::repr::NNModule *nn)
 Helpers for the convertToNNModule for use if you already have an NNModule. More...
 
void injectDataEdgeIndicators (nom::repr::NNModule *nn)
 
void removeDataEdgeIndicators (nom::repr::NNModule *nn)
 
nom::repr::NNModule convertToNNModule (caffe2::NetDef &, std::map< std::string, caffe2::DeviceOption >)
 Convert to an NNModule and apply a mapping of tensor names to DeviceOptions to it. More...
 
 C10_DEFINE_REGISTRY (WorkspaceOptimizationPassRegistry, WorkspaceOptimizationPass, NNModule *, Workspace *)
 
 C10_DEFINE_REGISTRY (OptimizationPassRegistry, OptimizationPass, NNModule *)
 
 C10_DECLARE_REGISTRY (WorkspaceOptimizationPassRegistry, WorkspaceOptimizationPass, NNModule *, Workspace *)
 
 C10_DECLARE_REGISTRY (OptimizationPassRegistry, OptimizationPass, NNModule *)
 
ShapeInfo getShapeInfoFromBlob (const Blob *blob)
 
bool operator== (const ShapeInfo &lhs, const ShapeInfo &rhs)
 
void adagrad_update__base (int N, const float *w, const float *g, const float *h, float *nw, float *nh, float epsilon, float decay, const float lr)
 
void adagrad_update_prefetch__base (int N, const float *w, const float *, const float *g, const float *h, const float *, float *nw, float *, float *nh, float *, float epsilon, float lr)
 
void adagrad_fp16_update_prefetch__base (int N, const at::Half *w, const at::Half *, const float *g, const at::Half *h, const at::Half *, at::Half *nw, at::Half *, at::Half *nh, at::Half *, float epsilon, float lr)
 
void rowwise_adagrad_update__base (int N, float *w, float *w_n, const float *g, float *h, float *h_n, float epsilon, float lr)
 
void adagrad_update (int N, const float *w, const float *g, const float *h, float *nw, float *nh, float epsilon, float decay, float lr)
 
void adagrad_update_prefetch (int N, const float *w, const float *w_n, const float *g, const float *h, const float *h_n, float *nw, float *nw_n, float *nh, float *nh_n, float epsilon, float lr)
 
void adagrad_fp16_update_prefetch (int N, const at::Half *w, const at::Half *w_n, const float *g, const at::Half *h, const at::Half *h_n, at::Half *nw, at::Half *nw_n, at::Half *nh, at::Half *nh_n, float epsilon, float lr)
 
void rowwise_adagrad_update (int N, float *w, float *w_n, const float *g, float *h, float *h_n, float epsilon, float lr)
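A numpy reference sketch of the dense Adagrad step these kernels compute, assuming the conventional formulation (accumulate squared gradients, then scale the step by the accumulated history). This illustrates the `adagrad_update` signature above and is not the vectorized implementation itself.

```
import numpy as np

def adagrad_update_ref(w, g, h, epsilon, decay, lr):
    # Assumed formulation: nh = decay * h + g^2; nw = w + lr * g / (sqrt(nh) + epsilon)
    nh = decay * h + g * g
    nw = w + lr * g / (np.sqrt(nh) + epsilon)
    return nw, nh

w = np.zeros(4, dtype=np.float32)
g = np.array([0.1, -0.2, 0.3, 0.0], dtype=np.float32)
h = np.zeros(4, dtype=np.float32)
nw, nh = adagrad_update_ref(w, g, h, epsilon=1e-5, decay=1.0, lr=0.01)
print(nw, nh)
```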
 
 SPARSE_ADAGRAD_SPECIALIZATION (int32_t, base)
 
template<>
int sparse_adagrad (int num_rows, int block_size, uint64_t param_size, const float *w, const float *g, const float *h, const int32_t *indices, float *nw, float *nh, float epsilon, float lr)
 
 SPARSE_ADAGRAD_SPECIALIZATION (int64_t, base)
 
template<>
int sparse_adagrad (int num_rows, int block_size, uint64_t param_size, const float *w, const float *g, const float *h, const int64_t *indices, float *nw, float *nh, float epsilon, float lr)
 
template<typename SIndex >
int sparse_adagrad (int num_rows, int block_size, std::uint64_t param_size, const float *w, const float *g, const float *h, const SIndex *indices, float *nw, float *nh, float epsilon, float lr)
 
void adagrad_update__avx_f16c (int N, const float *w, const float *g, const float *h, float *nw, float *nh, float epsilon, float decay, float lr)
 
void adagrad_update_prefetch__avx_f16c (int N, const float *w, const float *w_n, const float *g, const float *h, const float *h_n, float *nw, float *nw_n, float *nh, float *nh_n, float epsilon, float lr)
 
void adagrad_fp16_update_prefetch__avx_f16c (int N, const at::Half *w, const at::Half *w_n, const float *g, const at::Half *h, const at::Half *h_n, at::Half *nw, at::Half *nw_n, at::Half *nh, at::Half *nh_n, float epsilon, float lr)
 
void rowwise_adagrad_update__avx_f16c (int N, float *w, float *w_n, const float *g, float *h, float *h_n, float epsilon, float lr)
 
 SPARSE_ADAGRAD_SPECIALIZATION (int32_t, avx_f16c)
 
 SPARSE_ADAGRAD_SPECIALIZATION (int64_t, avx_f16c)
 
 EMBEDDING_SPECIALIZATION (int32_t, float, float, float, false)
 
 EMBEDDING_SPECIALIZATION (int64_t, float, float, float, false)
 
 EMBEDDING_SPECIALIZATION (int32_t, half, at::Half, float, false)
 
 EMBEDDING_SPECIALIZATION (int64_t, half, at::Half, float, false)
 
 EMBEDDING_SPECIALIZATION (int32_t, uint8_t, uint8_t, float, false)
 
 EMBEDDING_SPECIALIZATION (int64_t, uint8_t, uint8_t, float, false)
 
 EMBEDDING_SPECIALIZATION (int32_t, float, float, float, true)
 
 EMBEDDING_SPECIALIZATION (int64_t, float, float, float, true)
 
 EMBEDDING_SPECIALIZATION (int32_t, half, at::Half, float, true)
 
 EMBEDDING_SPECIALIZATION (int64_t, half, at::Half, float, true)
 
 EMBEDDING_SPECIALIZATION (int32_t, uint8_t, uint8_t, float, true)
 
 EMBEDDING_SPECIALIZATION (int64_t, uint8_t, uint8_t, float, true)
 
template<typename IndexType , typename InType , typename OutType , bool IS_WEIGHT_POSITIONAL = false>
void EmbeddingLookup (const std::int64_t block_size, const std::int64_t output_size, const std::int64_t index_size, const std::int64_t data_size, const InType *input, const IndexType *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, OutType *out)
 Embedding lookup with reduction. More...
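A compact numpy reference sketch of the reduction EmbeddingLookup performs (an assumption for illustration: per-segment weighted sums of embedding rows, with optional length normalization); the `scale_bias` path used for uint8 inputs is omitted.

```
import numpy as np

def embedding_lookup_ref(data, indices, lengths, weights=None, normalize_by_lengths=False):
    # data: (data_size, block_size) embedding table
    # lengths[i]: how many entries of `indices` belong to output segment i
    out = np.zeros((len(lengths), data.shape[1]), dtype=np.float32)
    pos = 0
    for i, n in enumerate(lengths):
        for _ in range(n):
            w = 1.0 if weights is None else weights[pos]
            out[i] += w * data[indices[pos]]
            pos += 1
        if normalize_by_lengths and n > 0:
            out[i] /= n
    return out

table = np.arange(12, dtype=np.float32).reshape(6, 2)
print(embedding_lookup_ref(table, indices=[0, 3, 5], lengths=[2, 1]))
```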
 
bool EmbeddingLookup_int32_t_float_float_false__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const float *input, const int *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out)
 
bool EmbeddingLookup_int32_t_float_float_true__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const float *input, const int *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out)
 
bool EmbeddingLookup_int64_t_float_float_false__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const float *input, const int64_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out)
 
bool EmbeddingLookup_int64_t_float_float_true__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const float *input, const int64_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out)
 
bool EmbeddingLookup_int32_t_half_float_false__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::Half *input, const int *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out)
 
bool EmbeddingLookup_int32_t_half_float_true__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::Half *input, const int *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out)
 
bool EmbeddingLookup_int64_t_half_float_false__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::Half *input, const int64_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out)
 
bool EmbeddingLookup_int64_t_half_float_true__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::Half *input, const int64_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out)
 
bool EmbeddingLookup_int32_t_uint8_t_float_false__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const uint8_t *input, const int *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out)
 
bool EmbeddingLookup_int32_t_uint8_t_float_true__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const uint8_t *input, const int *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out)
 
bool EmbeddingLookup_int64_t_uint8_t_float_false__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const uint8_t *input, const int64_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out)
 
bool EmbeddingLookup_int64_t_uint8_t_float_true__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const uint8_t *input, const int64_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out)
 
bool Fused8BitRowwiseEmbeddingLookup_int32_t_float_float_false__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const float *input, const int *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out)
 
bool Fused8BitRowwiseEmbeddingLookup_int32_t_float_float_true__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const float *input, const int *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out)
 
bool Fused8BitRowwiseEmbeddingLookup_int64_t_float_float_false__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const float *input, const int64_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out)
 
bool Fused8BitRowwiseEmbeddingLookup_int64_t_float_float_true__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const float *input, const int64_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out)
 
bool Fused8BitRowwiseEmbeddingLookup_int32_t_half_float_false__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::Half *input, const int *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out)
 
bool Fused8BitRowwiseEmbeddingLookup_int32_t_half_float_true__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::Half *input, const int *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out)
 
bool Fused8BitRowwiseEmbeddingLookup_int64_t_half_float_false__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::Half *input, const int64_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out)
 
bool Fused8BitRowwiseEmbeddingLookup_int64_t_half_float_true__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const at::Half *input, const int64_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out)
 
bool Fused8BitRowwiseEmbeddingLookup_int32_t_uint8_t_float_false__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const uint8_t *input, const int *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out)
 
bool Fused8BitRowwiseEmbeddingLookup_int32_t_uint8_t_float_true__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const uint8_t *input, const int *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out)
 
bool Fused8BitRowwiseEmbeddingLookup_int64_t_uint8_t_float_false__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const uint8_t *input, const int64_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out)
 
bool Fused8BitRowwiseEmbeddingLookup_int64_t_uint8_t_float_true__avx2_fma (const int64_t block_size, const int64_t output_size, const int64_t index_size, const int64_t data_size, const uint8_t *input, const int64_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out)
 
 FUSED_8BIT_ROWWISE_EMBEDDING_SPECIALIZATION (int32_t, float)
 
 FUSED_8BIT_ROWWISE_EMBEDDING_SPECIALIZATION (int64_t, float)
 
template<typename IndexType , typename InType , typename OutType , bool IS_WEIGHT_POSITIONAL = false>
void Fused8BitRowwiseEmbeddingLookup (const std::int64_t block_size, const std::int64_t output_size, const std::int64_t index_size, const std::int64_t data_size, const InType *input, const IndexType *indices, const int *lengths, const float *weights, bool normalize_by_lengths, OutType *out)
 Embedding lookup with reduction. More...
 
template<>
void TypedAxpy< float, float > (int N, const float a, const float *x, float *y)
 
void TypedAxpyHalffloat__base (int N, const float a, const at::Half *x, float *y)
 
template<>
void TypedAxpy< at::Half, float > (int N, const float a, const at::Half *x, float *y)
 
void TypedAxpy_uint8_float__base (int N, const float a, const std::uint8_t *x, float *y)
 
template<>
void TypedAxpy< std::uint8_t, float > (int N, const float a, const std::uint8_t *x, float *y)
 
template<typename IN , typename OUT >
void TypedAxpy (int N, const OUT a, const IN *x, OUT *y)
 
void TypedAxpyHalffloat__avx_f16c (int N, const float a, const at::Half *x, float *y)
 
void TypedAxpyHalffloat__avx2_fma (int N, const float a, const at::Half *x, float *y)
 
void TypedAxpy_uint8_float__avx2_fma (int N, const float a, const std::uint8_t *x, float *y)
 
PredictorConfig makePredictorConfig (const MetaNetDef &def, Workspace *parent, bool run_init)
 
PredictorConfig makePredictorConfig (const NetDef &init_net, const NetDef &run_net, Workspace *parent, bool run_init, int optimization)
 
CAFFE2_API Workspace makeWorkspace (std::shared_ptr< PredictorParameters > parameters)
 
CAFFE2_API DeviceType ProtoToType (const caffe2::DeviceTypeProto p)
 
CAFFE2_API DeviceType ProtoToType (int p)
 
CAFFE2_API DeviceTypeProto TypeToProto (const DeviceType &t)
 
CAFFE2_API caffe2::DeviceOption DeviceToOption (const at::Device &device)
 
CAFFE2_API at::Device OptionToDevice (const caffe2::DeviceOption option)
 
void ExtractDeviceOption (DeviceOption *device_option, const at::Device &device)
 
template<typename T >
void FindMinMax (const T *data, float *min, float *max, int len)
 
template<>
void FindMinMax< float > (const float *data, float *min, float *max, int len)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (BatchMatMul, DNNLOWP, BatchMatMulDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (BatchMatMul, DNNLOWP_16, BatchMatMulDNNLowPOp< uint16_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8BatchMatMul, DNNLOWP, BatchMatMulDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (BatchPermutation, DNNLOWP, BatchPermutationDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8BatchPermutation, DNNLOWP, BatchPermutationDNNLowPOp< uint8_t >)
 
 OPERATOR_SCHEMA (Int8BatchPermutation).NumInputs(2).NumOutputs(1)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (ChannelShuffle, DNNLOWP, ChannelShuffleDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8ChannelShuffle, DNNLOWP, ChannelShuffleDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (ChannelShuffle, DNNLOWP_16, ChannelShuffleDNNLowPOp< uint16_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Concat, DNNLOWP, ConcatDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8Concat, DNNLOWP, ConcatDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Conv, DNNLOWP_ACC16, ConvDNNLowPAcc16Op< false >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (ConvRelu, DNNLOWP_ACC16, ConvDNNLowPAcc16Op< true >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8Conv, DNNLOWP_ACC16, ConvDNNLowPAcc16Op< false >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8ConvRelu, DNNLOWP_ACC16, ConvDNNLowPAcc16Op< true >)
 
 OPERATOR_SCHEMA (ConvRelu).NumInputs(2
 
NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Conv, DNNLOWP, ConvDNNLowPOp< uint8_t, false >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (ConvRelu, DNNLOWP, ConvDNNLowPOp< uint8_t, true >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8Conv, DNNLOWP, ConvDNNLowPOp< uint8_t, false >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8ConvRelu, DNNLOWP, ConvDNNLowPOp< uint8_t, true >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Conv, DNNLOWP_16, ConvDNNLowPOp< uint16_t, false >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (ConvRelu, DNNLOWP_16, ConvDNNLowPOp< uint16_t, true >)
 
 REGISTER_CPU_OPERATOR (ConvRelu, ConvReluOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Dequantize, DNNLOWP, DequantizeDNNLowPOp< std::uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Dequantize, DNNLOWP_ROWWISE, DequantizeDNNLowPOp< std::uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Dequantize, DNNLOWP_16, DequantizeDNNLowPOp< std::uint16_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Dequantize, DNNLOWP_ROWWISE_16, DequantizeDNNLowPOp< std::uint16_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8Dequantize, DNNLOWP, DequantizeDNNLowPOp< std::uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8Dequantize, DNNLOWP_ROWWISE, DequantizeDNNLowPOp< std::uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8DequantizeRowWise, DNNLOWP, DequantizeDNNLowPOp< std::uint8_t >)
 
int dnnlowp_get_num_threads ()
 
int dnnlowp_get_max_threads ()
 
int dnnlowp_get_thread_num ()
 
std::pair< size_t, size_t > Get1DPartition (size_t work, int nthreads, int tid, int work_align)
 
void Get1DPartitionOf2D (int m, int n, int nthreads, int thread_id, int *m_begin, int *m_end, int *n_begin, int *n_end, int n_align=1)
 1D-partition m x n 2D work. More...
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Add, DNNLOWP, AddDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8Add, DNNLOWP, AddDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (ElementwiseLinear, DNNLOWP, ElementwiseLinearDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8ElementwiseLinear, DNNLOWP, ElementwiseLinearDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Mul, DNNLOWP, MulDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8Mul, DNNLOWP, MulDNNLowPOp< uint8_t >)
 
Input(0, ..., "First of the input tensors. Can be inplace.").Output(0, "sum", "Output tensor. Same dimension as inputs.")
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Sum, DNNLOWP, SumDNNLowPOp< uint8_t, false >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (SumRelu, DNNLOWP, SumDNNLowPOp< uint8_t, true >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8Sum, DNNLOWP, SumDNNLowPOp< uint8_t, false >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8SumRelu, DNNLOWP, SumDNNLowPOp< uint8_t, true >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Sum, DNNLOWP_16, SumDNNLowPOp< uint16_t, false >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (SumRelu, DNNLOWP_16, SumDNNLowPOp< uint16_t, true >)
 
 REGISTER_CPU_OPERATOR (SumRelu, SumReluOp< CPUContext >)
 
template<typename ACC_T >
shared_ptr< fbgemm::PackBMatrix< int8_t, ACC_T > > GetOrCreateFbgemmPackBMatrix (fbgemm::matrix_op_t trans, int32_t m, int32_t n, const void *orig_data, const int8_t *quantized_data, int32_t ld)
 
template shared_ptr< fbgemm::PackBMatrix< int8_t, int16_t > > GetOrCreateFbgemmPackBMatrix< int16_t > (fbgemm::matrix_op_t trans, int32_t m, int32_t n, const void *orig_data, const int8_t *quantized_data, int32_t ld)
 
template shared_ptr< fbgemm::PackBMatrix< int8_t, int32_t > > GetOrCreateFbgemmPackBMatrix< int32_t > (fbgemm::matrix_op_t trans, int32_t m, int32_t n, const void *orig_data, const int8_t *quantized_data, int32_t ld)
 
template<typename ACC_T >
std::shared_ptr< fbgemm::PackBMatrix< int8_t, ACC_T > > GetOrCreateFbgemmPackBMatrix (fbgemm::matrix_op_t trans, std::int32_t m, std::int32_t n, const void *orig_data, const std::int8_t *quantized_data, std::int32_t ld)
 If there's an existing packed matrix for the same matrix, reuse it. More...
 
template<typename T >
void QuantizeWeight (const Blob &blob, int kernel_dim, int M, vector< TensorQuantizationParams > &qparams, vector< typename make_signed< T >::type > &W_quantized, dnnlowp::QuantizationFactory *qfactory)
 
template void QuantizeWeight< uint8_t > (const Blob &blob, int kernel_dim, int M, vector< TensorQuantizationParams > &qparams, vector< int8_t > &W_quantized, dnnlowp::QuantizationFactory *qfactory)
 
template void QuantizeWeight< uint16_t > (const Blob &blob, int kernel_dim, int M, vector< TensorQuantizationParams > &qparams, vector< int16_t > &W_quantized, dnnlowp::QuantizationFactory *qfactory)
 
template<typename T >
void ComputeColumnOffsets (int num_rows, int num_cols, const T *W, const vector< TensorQuantizationParams > &qparams, vector< int32_t > &col_offsets)
 
template void ComputeColumnOffsets< int8_t > (int num_rows, int num_cols, const int8_t *W, const vector< TensorQuantizationParams > &qparams, vector< int32_t > &col_offsets)
 
template void ComputeColumnOffsets< int16_t > (int num_rows, int num_cols, const int16_t *W, const vector< TensorQuantizationParams > &qparams, vector< int32_t > &col_offsets)
 
fbgemm::CompressedSparseColumn * ExtractOutlierMatrix (int groups, int kernel_dim, int M, int nbits_in_non_outlier, vector< int8_t > &W_quantized)
 
 CAFFE_KNOWN_TYPE (Int8FCDNNLowPPackedWeightBlob)
 
 CAFFE_KNOWN_TYPE (Int8ConvDNNLowPPackedWeightBlob)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8FCPackWeight, DNNLOWP, FullyConnectedDNNLowPPackWeightOp)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8FCPackWeight, DNNLOWP_ACC16, FullyConnectedDNNLowPPackWeightOp)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8FCPackWeight, DNNLOWP_ROWWISE, FullyConnectedDNNLowPPackWeightOp)
 
Input(0, "W", "Weight tensor in KRSC layout").Input(1, "b", "Bias tensor").Output(0
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8ConvPackWeight, DNNLOWP, ConvDNNLowPPackWeightOp)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8ConvPackWeight, DNNLOWP_ACC16, ConvDNNLowPPackWeightOp)
 
template<typename T >
void QuantizeWeight (const Blob &blob, int kernel_dim, int M, vector< dnnlowp::TensorQuantizationParams > &qparams, vector< typename std::make_signed< T >::type > &w_quantized, dnnlowp::QuantizationFactory *qfactory)
 
fbgemm::CompressedSparseColumn * ExtractOutlierMatrix (int groups, int kernel_dim, int M, int nbits_in_non_outlier, vector< std::int8_t > &W_quantized)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (FC, DNNLOWP_ACC16, FullyConnectedDNNLowPAcc16Op)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8FC, DNNLOWP_ACC16, FullyConnectedDNNLowPAcc16Op)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (FC, DNNLOWP, FullyConnectedDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (FC, DNNLOWP_16, FullyConnectedDNNLowPOp< uint16_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8FC, DNNLOWP, FullyConnectedDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (FC, DNNLOWP_ROWWISE, FullyConnectedDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (FC, DNNLOWP_ROWWISE_16, FullyConnectedDNNLowPOp< uint16_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8FC, DNNLOWP_ROWWISE, FullyConnectedDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (FC, FAKE_FP16, FullyConnectedFakeLowpFPOp< fp32_to_fp16, CPUContext >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (FCGradient, FAKE_FP16, FullyConnectedGradientFakeLowpFPOp< fp32_to_fp16, CPUContext >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (FC, FAKE_BFP_16, FullyConnectedFakeLowpFPOp< fp32_to_bfp16, CPUContext >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (FCGradient, FAKE_BFP_16, FullyConnectedGradientFakeLowpFPOp< fp32_to_bfp16, CPUContext >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (FC, FAKE_BFP_24, FullyConnectedFakeLowpFPOp< fp32_to_bfp24, CPUContext >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (FCGradient, FAKE_BFP_24, FullyConnectedGradientFakeLowpFPOp< fp32_to_bfp24, CPUContext >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (FC, FAKE_BFP_14, FullyConnectedFakeLowpFPOp< fp32_to_bfp14, CPUContext >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (FCGradient, FAKE_BFP_14, FullyConnectedGradientFakeLowpFPOp< fp32_to_bfp14, CPUContext >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (FC, FAKE_BFP_16_ROUND, FullyConnectedFakeLowpFPOp< fp32_to_bfp16_round, CPUContext >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (FCGradient, FAKE_BFP_16_ROUND, FullyConnectedGradientFakeLowpFPOp< fp32_to_bfp16_round, CPUContext >)
 
void fp32_to_bfp16 (const float *source, size_t size, float *dest)
 
void fp32_to_bfp24 (const float *source, size_t size, float *dest)
 
void fp32_to_bfp14 (const float *source, size_t size, float *dest)
 
void fp32_to_bfp16_scalar (const float *source, size_t size, float *dest)
 
void fp32_to_fp16 (const float *source, size_t size, float *dest)
 
void fp32_to_bfp16_round (const float *source, size_t size, float *dest)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (GroupNorm, DNNLOWP, GroupNormDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8GroupNorm, DNNLOWP, GroupNormDNNLowPOp< uint8_t >)
 
 OPERATOR_SCHEMA (Int8GroupNorm).NumInputs(3).NumOutputs(
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (LSTMUnit, DNNLOWP, LSTMUnitDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8LSTMUnit, DNNLOWP, LSTMUnitDNNLowPOp< uint8_t >)
 
template<typename T >
void StoreMatrixInMatrixMarketFormat (int m, int n, const T *a, const std::string &matrix_name)
 
void max_pool_avx2 (const uint8_t *Xdata, int n, int height, int width, int channels, int pooled_height, int pooled_width, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_t, int pad_l, uint8_t *Ydata)
 
void max_pool_avx2 (const std::uint8_t *Xdata, int n, int height, int width, int channels, int pooled_height, int pooled_width, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_t, int pad_l, std::uint8_t *Ydata)
 Optimized using AVX2 intrinsics for max pool 2D in NHWC layout.
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Quantize, DNNLOWP, QuantizeDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Quantize, DNNLOWP_ROWWISE, QuantizeDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Quantize, DNNLOWP_16, QuantizeDNNLowPOp< uint16_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Quantize, DNNLOWP_ROWWISE_16, QuantizeDNNLowPOp< uint16_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8Quantize, DNNLOWP, QuantizeDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8Quantize, DNNLOWP_ROWWISE, QuantizeDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Relu, DNNLOWP, ReluDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Relu, DNNLOWP_16, ReluDNNLowPOp< uint16_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8Relu, DNNLOWP, ReluDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8ResizeNearest, DNNLOWP, ResizeNearestDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Sigmoid, DNNLOWP, UnaryElementwiseWithArgsDNNLowPOp< std::uint8_t, SigmoidFunctor< std::uint8_t >>)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8Sigmoid, DNNLOWP, UnaryElementwiseWithArgsDNNLowPOp< std::uint8_t, SigmoidFunctor< std::uint8_t >>)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (SpatialBN, DNNLOWP, SpatialBNDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8SpatialBN, DNNLOWP, SpatialBNDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Tanh, DNNLOWP, UnaryElementwiseWithArgsDNNLowPOp< std::uint8_t, TanhFunctor< std::uint8_t >>)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8Tanh, DNNLOWP, UnaryElementwiseWithArgsDNNLowPOp< std::uint8_t, TanhFunctor< std::uint8_t >>)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Gather, DNNLOWP, GatherDNNLowPOp< uint8_t >)
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Int8Gather, DNNLOWP, GatherDNNLowPOp< uint8_t >)
 
 CAFFE_KNOWN_TYPE (std::shared_ptr< BlobsQueue >)
 
 REGISTER_CPU_OPERATOR (CreateBlobsQueue, CreateBlobsQueueOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (EnqueueBlobs, EnqueueBlobsOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (DequeueBlobs, DequeueBlobsOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (CloseBlobsQueue, CloseBlobsQueueOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (SafeEnqueueBlobs, SafeEnqueueBlobsOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (SafeDequeueBlobs, SafeDequeueBlobsOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (WeightedSampleDequeueBlobs, WeightedSampleDequeueBlobsOp< CPUContext >)
 
 OPERATOR_SCHEMA (CreateBlobsQueue).NumInputs(0).NumOutputs(1)
 
 EnqueueBlobs schema: NumInputsOutputs([](int inputs, int outputs) { return inputs >= 2 && outputs >= 1 && inputs == outputs + 1; }), with the data outputs enforced in-place with the corresponding data inputs.
 
 DequeueBlobs schema: NumInputsOutputs([](int inputs, int outputs) { return inputs == 1 && outputs >= 1; }).SetDoc(R"DOC( Dequeue the blobs from queue. )DOC"). Arg "timeout_secs": Timeout in seconds. Input (0) "queue": The shared pointer for the BlobsQueue. Output (0) "blob": The blob to store the dequeued data.
 
 OPERATOR_SCHEMA (CloseBlobsQueue).NumInputs(1).NumOutputs(0)
 
 SafeEnqueueBlobs schema: NumInputsOutputs([](int inputs, int outputs) { return inputs >= 2 && outputs >= 2 && inputs == outputs; }), enforced in-place, with doc: "Enqueue the blobs into queue. When the queue is closed and full, the output status will be set to true, which can be used as an exit criterion for an execution step. The 1st input is the queue and the last output is the status. The rest are data blobs."
 
 SafeDequeueBlobs schema: NumInputsOutputs([](int inputs, int outputs) { return inputs == 1 && outputs >= 2; }), with doc: "Dequeue the blobs from queue. When the queue is closed and empty, the output status will be set to true, which can be used as an exit criterion for an execution step. The input is the queue and the last output is the status. The rest are data blobs." Arg "num_records": (default 1) If > 1, multiple records will be dequeued and tensors for each column will be concatenated; this requires all tensors in the records to be at least 1D and to have the same inner dimensions.
 
 WeightedSampleDequeueBlobs schema: up to INT_MAX inputs, with doc: "Dequeue the blobs from multiple queues. When one of the queues is closed and empty, the output status will be set to true, which can be used as an exit criterion for an execution step. The 1st input is the queue and the last output is the status. The rest are data blobs." Arg "weights": Weights for sampling from multiple queues. Arg "table_idx_blob": The index of the blob (among the output blob list) that will be used to store the index of the table chosen to read the current batch.
 
 NO_GRADIENT (CreateBlobsQueue)
 
 NO_GRADIENT (EnqueueBlobs)
 
 NO_GRADIENT (DequeueBlobs)
 
 NO_GRADIENT (CloseBlobsQueue)
 
 NO_GRADIENT (SafeEnqueueBlobs)
 
 NO_GRADIENT (SafeDequeueBlobs)
 
 NO_GRADIENT (WeightedSampleDequeueBlobs)
 
 REGISTER_CUDA_OPERATOR (CreateBlobsQueue, CreateBlobsQueueOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (EnqueueBlobs, EnqueueBlobsOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (DequeueBlobs, DequeueBlobsOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (CloseBlobsQueue, CloseBlobsQueueOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (SafeEnqueueBlobs, SafeEnqueueBlobsOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (SafeDequeueBlobs, SafeDequeueBlobsOp< CUDAContext >)
 
 CAFFE_KNOWN_TYPE (RebatchingQueuePtr)
 
 REGISTER_CPU_OPERATOR (Adadelta, AdadeltaOp< CPUContext >)
 
 SetDoc (R"DOC( Computes the AdaDelta update (https://arxiv.org/abs/1212.5701) for an input gradient and accumulated history of squared gradients. Concretely, given inputs (param, moment, moment_delta, grad, learning_rate), computes: new_moment = moment * decay + square(grad) * (1 - decay) new_grad = sqrt(moment_delta + epsilon) / sqrt(new_moment + epsilon) * grad new_param = param + learning_rate * new_grad new_moment_delta = moment_delta * decay + square(new_grad) * (1 - decay) and returns (new_param, new_moment, new_moment_delta). )DOC").Input(0
 
Parameters to be updated Input (1,"moment","Average of squared gradients").Input(2
 
Parameters to be updated Average of squared parameter updates Input (3,"grad","Gradient computed").Input(4
 
Parameters to be updated Average of squared parameter updates Learning rate Output (0,"output_param","Updated parameters").Output(1
 
Parameters to be updated Average of squared parameter updates Learning rate Updated average squared gradient Output (2,"output_moment_delta","Updated average of squared parameter updates").Arg("epsilon"
 
Parameters to be updated Average of squared parameter updates Learning rate Updated average squared gradient Default Arg ("decay","Default 0.95, the squared gradient sum is decayed by this factor.")
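
The AdaDelta formula above maps directly onto a per-element loop. A minimal, self-contained sketch of the stated math (not the registered operator itself):

    #include <cmath>
    #include <cstddef>

    // Element-wise AdaDelta step as described in the schema doc.
    void adadelta_update_sketch(size_t N, const float* param, const float* moment,
                                const float* moment_delta, const float* grad,
                                float lr, float epsilon, float decay,
                                float* new_param, float* new_moment,
                                float* new_moment_delta) {
      for (size_t i = 0; i < N; ++i) {
        const float m = moment[i] * decay + grad[i] * grad[i] * (1.0f - decay);
        const float g = std::sqrt(moment_delta[i] + epsilon) /
                        std::sqrt(m + epsilon) * grad[i];
        new_moment[i] = m;
        new_param[i] = param[i] + lr * g;
        new_moment_delta[i] = moment_delta[i] * decay + g * g * (1.0f - decay);
      }
    }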
 
 REGISTER_CPU_OPERATOR (SparseAdadelta, SparseAdadeltaOp< CPUContext >)
 
 NumInputs (6).NumOutputs(3).EnforceOneToOneInplace().SetDoc(R"DOC( Given inputs (param, moment, moment_delta, indices, grad, lr), runs the dense AdaDelta update on (param, grad, moment[indices], moment_delta[indices], lr) and returns (new_param, new_moment, new_moment_delta) as in the dense case. )DOC")
 
 Inputs: (0) "param": Parameters to be updated; (1) "moment": Average of squared gradients; (2) "moment_delta": Average of squared parameter updates; (3) "indices": Sparse indices; (4) "grad": Gradient computed; (5) "lr": learning rate.
 
 Outputs: (0) "output_param": Updated parameters; (1) "output_moment": Updated average squared gradient; (2) "output_moment_delta": Updated average of squared parameter updates.
 
 Args: "epsilon": Default 1e-5; "decay": Default 0.95, the squared gradient sum is decayed by this factor.
 
 SHOULD_NOT_DO_GRADIENT (Adadelta)
 
 SHOULD_NOT_DO_GRADIENT (SparseAdadelta)
 
 REGISTER_CPU_OPERATOR (Adagrad, AdagradOp< float, CPUContext >)
 
 AllowInplace ({{0, 0},{1, 1}}).SetDoc(R"DOC( Computes the AdaGrad update for an input gradient and accumulated history. Concretely, given inputs (param, grad, moment, learning_rate), computes new_moment = moment + square(grad) and new_param = param + learning_rate * grad / (sqrt(new_moment) + epsilon), and returns (new_param, new_moment). )DOC")
 
template<typename Context >
void adagrad_update (int N, const float *w, const float *g, const float *h, float *nw, float *nh, float epsilon, float decay, const float *lr, Context *)
 
template<typename Context >
void adagrad_update_output_effective_lr (int N, const float *paramIn, const float *gradIn, const float *momentIn, float *paramOut, float *momentOut, float *effectiveLROut, float epsilon, float decay, const float *lr, Context *)
 
template<typename Context >
void adagrad_update_output_effective_lr_and_update (int N, const float *paramIn, const float *gradIn, const float *momentIn, float *paramOut, float *momentOut, float *effectiveLROut, float *updateOut, float epsilon, float decay, const float *lr, Context *)
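
A scalar sketch of the step the adagrad_update helper above performs, following the AdaGrad formula in the schema doc (the exact treatment of the decay argument is an assumption):

    #include <cmath>

    // Element-wise AdaGrad step: h accumulates (decayed) squared gradients,
    // w moves along the gradient scaled by the per-element effective lr.
    void adagrad_update_sketch(int N, const float* w, const float* g,
                               const float* h, float* nw, float* nh,
                               float epsilon, float decay, const float* lr) {
      for (int i = 0; i < N; ++i) {
        const float hi = decay * h[i] + g[i] * g[i];
        nh[i] = hi;
        nw[i] = w[i] + lr[0] * g[i] / (std::sqrt(hi) + epsilon);
      }
    }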
 
 REGISTER_CPU_OPERATOR (Adam, AdamOp< float, CPUContext >)
 
 AllowInplace ({{0, 0},{1, 1},{2, 2}}).DeviceInferenceFunction([](const OperatorDef &def) { ... })
 
 SetDoc (R"DOC( Computes the Adam update (https://arxiv.org/abs/1412.6980) for an input gradient and momentum parameters. Concretely, given inputs (param, m1, m2, grad, lr, iters),
 
   t = iters + 1
   correction_multiplier = sqrt(1 - power(beta2, t)) / (1 - power(beta1, t))
   m1_o = (beta1 * m1) + (1 - beta1) * grad
   m2_o = (beta2 * m2) + (1 - beta2) * np.square(grad)
   grad_o = correction_multiplier * m1_o / (sqrt(m2_o) + epsilon)
   param_o = param + lr * grad_o
 
 and returns (param_o, m1_o, m2_o, grad_o), in which grad_o is an optional output. )DOC")
 
 Inputs: (0) "param": Parameters to be updated; (1) "moment_1": First moment history; (2) "moment_2": Second moment history; (3) "grad": Gradient computed; (4) "lr": learning rate; (5) "iter": iteration number.
 
 Outputs: (0) "output_param": Updated parameters; (1) "output_moment_1": Updated first moment; (2) "output_moment_2": Updated second moment; (3) "output_grad": Optional effective gradient.
 
 Args: "beta1": Default 0.9; "beta2": Default 0.999; "epsilon": ...
 
 REGISTER_CPU_OPERATOR (SparseAdam, SparseAdamOp< float, CPUContext >)
 
 EnforceInplace ({{0, 0},{1, 1},{2, 2}}).SetDoc(R"DOC( Computes the Adam update for the sparse case. Given inputs (param, moment1, moment2, indices, grad, lr, iter), runs the dense Adam update on (param, moment1[indices], moment2[indices], lr, iter) and returns (new_param, new_moment1, new_moment2) as in the dense case. )DOC")
 
 Inputs: (0) "param": Parameters to be updated; (1) "moment_1": First moment history; (2) "moment_2": Second moment history; (3) "indices": Sparse indices; (4) "grad": Gradient computed; (5) "lr": learning rate; (6) "iter": iteration number.
 
 Outputs: (0) "output_param": Updated parameters; (1) "output_moment_1": Updated first moment; (2) "output_moment_2": Updated second moment.
 
 REGISTER_CPU_OPERATOR (RowWiseSparseAdam, RowWiseSparseAdamOp< float, CPUContext >)
 
 Like SparseAdam, RowWiseSparseAdam runs the Adam update on (param, moment1[indices], moment2[indices], lr, iter) and returns (new_param, new_moment1, new_moment2), but keeps only one second-moment value per row of param.
 
 SHOULD_NOT_DO_GRADIENT (Adam)
 
 SHOULD_NOT_DO_GRADIENT (SparseAdam)
 
 SHOULD_NOT_DO_GRADIENT (RowWiseSparseAdam)
 
template<typename Context >
void adam_update (int N, const float *g, const float *m, const float *v, float *ng, float *nm, float *nv, float beta1, float beta2, float eps_hat, float correction, const float *lr, Context *)
 
template<typename Context >
void adam_compute (int N, const float *w, const float *g, const float *m, const float *v, float *nw, float *nm, float *nv, float beta1, float beta2, float eps_hat, float correction, const float *lr, Context *)
 
template<typename Context >
void adam_compute_output_grad (int N, const float *w, const float *g, const float *m, const float *v, float *nw, float *nm, float *nv, float *ng, float beta1, float beta2, float eps_hat, float correction, const float *lr, Context *)
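
A scalar sketch of what adam_compute evaluates, transcribing the formula in the Adam schema doc above; the bias-correction multiplier is assumed to be precomputed by the caller:

    #include <cmath>

    // Element-wise Adam step. `correction` is
    // sqrt(1 - beta2^t) / (1 - beta1^t), computed once per iteration.
    void adam_compute_sketch(int N, const float* w, const float* g,
                             const float* m, const float* v, float* nw, float* nm,
                             float* nv, float beta1, float beta2, float eps_hat,
                             float correction, const float* lr) {
      for (int i = 0; i < N; ++i) {
        const float m_new = beta1 * m[i] + (1.0f - beta1) * g[i];
        const float v_new = beta2 * v[i] + (1.0f - beta2) * g[i] * g[i];
        nm[i] = m_new;
        nv[i] = v_new;
        nw[i] = w[i] + lr[0] * correction * m_new / (std::sqrt(v_new) + eps_hat);
      }
    }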
 
 REGISTER_CPU_OPERATOR (ClipTensorByScaling, ClipTensorByScalingOp< CPUContext >)
 
 SetDoc (R"DOC( Clips the input tensor by scaling based on the input value and the threshold. The value is usually the (pre-computed) norm of the tensor. If the value is larger than the threshold, scaling would be performed in this way: tensor *= (threshold / value). An optional input called additional_threshold can be provided which will scale the original threshold before it is used. That is, the final threshold will become threshold * additional_threshold. This op could be used for gradient clipping. )DOC").Input(0
 
Tensor of floats to be clipped Input (1,"val","Value to be compared against the threshold").Input(2
 
Tensor of floats to be clipped An optional additonal threshold to scale the orignal threshold Arg ("threshold","Threshold to determine whether to scale down the tensor").Output(0
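
The clipping rule above is simple enough to state in a few lines; a sketch of the described behavior (names are illustrative, not the operator's implementation):

    #include <cstddef>

    // Scale the tensor by threshold / value when the (precomputed) value,
    // e.g. a norm, exceeds the threshold; otherwise copy it unchanged.
    void clip_tensor_by_scaling_sketch(const float* input, size_t size,
                                       float value, float threshold,
                                       float* output) {
      const float scale = (value > threshold) ? threshold / value : 1.0f;
      for (size_t i = 0; i < size; ++i) {
        output[i] = input[i] * scale;
      }
    }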
 
 SHOULD_NOT_DO_GRADIENT (ClipTensorByScaling)
 
template<class Context >
void fp16_momentum_sgd_update (int N, const at::Half *g, const at::Half *m, at::Half *ng, at::Half *nm, const float *lr, float momentum, bool nesterov, float weight_decay, bool fp32_update, at::Half *param, Context *)
 
template<class Context >
void fp32_momentum_sgd_update (int N, const float *g, const float *m, float *ng, float *nm, const float *lr, float momentum, bool nesterov, float weight_decay, float *param, Context *)
 
template<class T >
T sgn (const T x)
 
template<typename T >
void ftrl_compute (const T w, const T n, const T z, const T g, T &nw, T &nn, T &nz, const FtrlParams< T > &params)
 
template<typename Context , typename T >
void ftrl_update (int N, const T *w, const T *nz, const T *g, T *new_w, T *new_nz, const FtrlParams< T > &params, Context *)
 
template<typename T >
void gftrl_compute (const T &w, const T &n, const T &z, const T &g, T &nw, T &nn, T &nz, const T &z_norm, const int OutputDim, const GFtrlParams< T > &params)
 
template<typename Context , typename T >
void gftrl_update (int OutputDim, int InputDim, const T *w, const T *nz, const T *g, T *new_w, T *new_nz, const GFtrlParams< T > &params, Context *)
 
 REGISTER_CPU_OPERATOR (Iter, IterOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (AtomicIter, AtomicIterOp< CPUContext >)
 
 REGISTER_BLOB_SERIALIZER ((TypeMeta::Id< std::unique_ptr< std::mutex >>()), MutexSerializer)
 
 REGISTER_BLOB_DESERIALIZER (std::unique_ptr< std::mutex >, MutexDeserializer)
 
 SetDoc (R"DOC( Stores a singe integer, that gets incremented on each call to Run(). Useful for tracking the iteration count during SGD, for example. )DOC")
 
 SetDoc (R"DOC( Similar to Iter, but takes a mutex as the first input to make sure that updates are carried out atomically. This can be used in e.g. Hogwild sgd algorithms. )DOC").Input(0
 
The mutex used to do atomic increment Input (1,"iter","The iter counter as an int64_t TensorCPU.")
 
 NO_GRADIENT (Iter)
 
 NO_GRADIENT (AtomicIter)
 
void IncrementIter (TensorCPU *output)
 
 REGISTER_CUDA_OPERATOR (Iter, IterOp< CUDAContext >)
 
 REGISTER_CUDA_OPERATOR (AtomicIter, AtomicIterOp< CUDAContext >)
 
 REGISTER_CPU_OPERATOR (Lars, LarsOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (LearningRateAdaption, LearningRateAdaptionOp< float, CPUContext >)
 
 SetDoc (R"DOC( Learning Rate Adaption is an operation that perform one iteration of gradient descent based on learning rate: lr(k) = lr(k-1) - lr_alpha * df(k-1)/dlr, where df(k-1)/dlr is the gradient of objective function f on lr, and lr_alpha is a learning rate hyperparameter. It can be prove that df(k-1)/dlr equals INNERPRODUCT(grad(k-1), -grad(k-2)), where grad(k-1) is the grad of f(k-1) on parameters. When the argument "normalized_lr_adaption" is false, we simply perform the following update: lr(k) = lr(k-1) - lr_alpha * INNERPRODUCT(grad(k-1), grad(k-2)). If we set "normalized_lr_adaption" to be true, we do not directly apply INNERPRODUCT(grad(k-1), -grad(k-2)) as the grad. Instead, we perform the following update: lr(k) = lr(k-1) + lr_alpha * cosineSimilarity(grad(k-1), grad(k-2)). )DOC").Arg("lr_alpha"
 
the learning rate for performing gradient descent on learning rate lr Arg ("normalized_lr_adaption","whether to apply normalized lr adaption or not").Input(0
 
the learning rate for performing gradient descent on learning rate lr Learning rate Input (1,"grad","Gradient computed").Input(2
 
the learning rate for performing gradient descent on learning rate lr Learning rate The effective grad Output (0,"output_lr","Updated learning rate")
 
 NO_GRADIENT (LearningRateAdaption)
 
template<typename Context >
void lr_update (int n, const float *grad, const float *effgrad, const float *lr, float *nlr, float lr_alpha, bool normalized_lr_adaption, Context *)
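
A scalar sketch of what the lr_update helper above can compute, following the schema doc and treating effgrad as the previous effective gradient grad(k-2); this is an illustration of the stated formulas, not the library code:

    #include <cmath>

    // Adjust a one-element learning rate blob using either the inner product
    // or the cosine similarity of grad and effgrad, per the doc above.
    void lr_update_sketch(int n, const float* grad, const float* effgrad,
                          const float* lr, float* nlr, float lr_alpha,
                          bool normalized_lr_adaption) {
      float x = 0.0f, norm_g = 0.0f, norm_e = 0.0f;
      for (int i = 0; i < n; ++i) {
        x += grad[i] * effgrad[i];
        norm_g += grad[i] * grad[i];
        norm_e += effgrad[i] * effgrad[i];
      }
      if (normalized_lr_adaption) {
        const float cosine = x / (std::sqrt(norm_g * norm_e) + 1e-12f);
        nlr[0] = lr[0] + lr_alpha * cosine;
      } else {
        nlr[0] = lr[0] - lr_alpha * x;
      }
    }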
 
 REGISTER_CPU_OPERATOR (LearningRate, LearningRateOp< float, CPUContext >)
 
 REGISTER_CUDA_OPERATOR (LearningRate, LearningRateOp< float, CUDAContext >)
 
 REGISTER_CPU_OPERATOR (MomentumSGD, MomentumSGDOp< float, CPUContext >)
 
 TensorInferenceFunction ([](const OperatorDef &, const vector< TensorShape > &in){vector< TensorShape > out(2);out[0]=in[0];out[1]=in[1];return out;}).SetDoc(R"DOC( Computes a momentum SGD update for an input gradient and momentum parameters. Concretely, given inputs (grad, m, lr) and parameters (momentum, nesterov), computes:
 
   if not nesterov:
     adjusted_gradient = lr * grad + momentum * m
     return (adjusted_gradient, adjusted_gradient)
   else:
     m_new = momentum * m + lr * grad
     return ((1 + momentum) * m_new - momentum * m, m_new)
 
 Output is (grad, momentum). Note the difference to MomentumSGDUpdate, which actually performs the parameter update (and is thus faster). )DOC")
 
 SHOULD_NOT_DO_GRADIENT (MomentumSGD)
 
 REGISTER_CPU_OPERATOR (MomentumSGDUpdate, MomentumSGDUpdateOp< float, CPUContext >)
 
 TensorInferenceFunction ([](const OperatorDef &, const vector< TensorShape > &in){vector< TensorShape > out(3);out[0]=in[0];out[1]=in[1];out[2]=in[3];return out;}).SetDoc(R"DOC( Performs a momentum SGD update for an input gradient and momentum parameters. Concretely, given inputs (grad, m, lr, param) and arguments (momentum, nesterov), computes the same adjusted gradient and m_new as MomentumSGD and additionally applies the update to param; in the Nesterov case it returns ((1 + momentum) * m_new - momentum * m, m_new, param). Output is (grad, momentum, param). )DOC")
 
 SHOULD_NOT_DO_GRADIENT (MomentumSGDUpdate)
 
 REGISTER_CPU_OPERATOR (SparseMomentumSGDUpdate, SparseMomentumSGDUpdateOp< float, CPUContext >)
 
 EnforceInplace ({{1, 1},{3, 2}}).TensorInferenceFunction([](const OperatorDef &, ...) { ... })
 
 SetDoc (R"DOC( Performs a momentum SGD update analogous to MomentumSGDUpdate, but using a GradientSlice and indices into the full param and momentum tables. Both param and momentum should be in-place (corresponding inputs and outputs should be the same blobs). )DOC")
 
 Inputs: (0) "grad": GradientSlice with gradients for updated indices; (1) "moment": Momentum blob, same shape as param; (2) "lr": Learning rate; (3) "param": Full parameter blob; (4) "indices": Indices (in first dimension of param) where updates are performed.
 
 Outputs: (0) "output_grad": Adjusted gradient; (1) "output_moment": Updated momentum; (2) "output_param": Updated parameter.
 
 Args: "momentum": Momentum hyperparameter; "nesterov": ...
 
 SHOULD_NOT_DO_GRADIENT (SparseMomentumSGDUpdate)
 
template<typename Context >
void momentum_sgd_update (const int N, const float *g, const float *m, float *ng, float *nm, const float *lr, const float momentum, const bool nesterov, float *param, Context *)
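
A scalar sketch of the momentum_sgd_update helper declared above, following the MomentumSGD / MomentumSGDUpdate docs; the convention that the parameter update is applied only when param is non-null, and that the adjusted gradient is subtracted from param, is an assumption:

    // Element-wise momentum SGD step; optionally applies the update to param.
    void momentum_sgd_update_sketch(const int N, const float* g, const float* m,
                                    float* ng, float* nm, const float* lr,
                                    const float momentum, const bool nesterov,
                                    float* param) {
      const float LR = lr[0];
      for (int i = 0; i < N; ++i) {
        float adjusted;
        if (!nesterov) {
          adjusted = LR * g[i] + momentum * m[i];
          nm[i] = adjusted;
        } else {
          const float m_new = momentum * m[i] + LR * g[i];
          adjusted = (1.0f + momentum) * m_new - momentum * m[i];
          nm[i] = m_new;
        }
        ng[i] = adjusted;
        if (param) {
          param[i] -= adjusted;  // assumed sign convention
        }
      }
    }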
 
template<>
void rmsprop_update< CPUContext > (int N, const float *g, const float *ms, const float *mom, float *ng, float *nms, float *nmom, float decay, float momentum, float epsilon, const float *lr, CPUContext *)
 
 REGISTER_CPU_OPERATOR (RmsProp, RmsPropOp< float, CPUContext >)
 
 SetDoc (R"DOC( Computes the RMSProp update (http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf). Concretely, given inputs (grad, mean_squares, mom, lr), computes: mean_squares_o = mean_squares + (1 - decay) * (square(grad) - mean_squares) mom_o = momentum * mom + lr * grad / sqrt(epsilon + mean_squares_o) grad_o = mom_o Returns (grad_o, mean_squares_o, mom_o). )DOC")
 
 SHOULD_NOT_DO_GRADIENT (RmsProp)
 
template<typename Context >
void rmsprop_update (int N, const float *g, const float *ms, const float *mom, float *ng, float *nms, float *nmom, float decay, float momentum, float epsilon, const float *lr, Context *context)
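
A scalar sketch of rmsprop_update, transcribing the formula in the RmsProp schema doc above (illustrative only):

    #include <cmath>

    // Element-wise RMSProp step: update the running mean of squared gradients,
    // fold it into the momentum buffer, and emit the momentum as the new grad.
    void rmsprop_update_sketch(int N, const float* g, const float* ms,
                               const float* mom, float* ng, float* nms,
                               float* nmom, float decay, float momentum,
                               float epsilon, const float* lr) {
      for (int i = 0; i < N; ++i) {
        nms[i] = ms[i] + (1.0f - decay) * (g[i] * g[i] - ms[i]);
        nmom[i] = momentum * mom[i] + lr[0] * g[i] / std::sqrt(epsilon + nms[i]);
        ng[i] = nmom[i];
      }
    }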
 
 REGISTER_CPU_OPERATOR (Wngrad, WngradOp< float, CPUContext >)
 
template<typename Context >
void wngrad_update (int N, const float *w, const float *g, const float *h, float *nw, float *nh, float epsilon, const float *lr, Context *)
 
template<typename Context >
void wngrad_update_output_effective_lr (int N, const float *paramIn, const float *gradIn, const float *seqBIn, float *paramOut, float *seqBOut, float *effectiveLROut, float epsilon, const float *lr, Context *)
 
template<typename Context >
void wngrad_update_output_effective_lr_and_update (int N, const float *paramIn, const float *gradIn, const float *seqBIn, float *paramOut, float *seqBOut, float *effectiveLROut, float *updateOut, float epsilon, const float *lr, Context *)
 
 REGISTER_CPU_OPERATOR (YellowFin, YellowFinOp< float, CPUContext >)
 
 NumInputs (10).NumOutputs(8).AllowInplace(...)
 
 SetDoc (R"DOC( Computes the YellowFin update (https://arxiv.org/abs/1706.03471) and performs momentum SGD optimization step. lr and mu are not being shared between parameters. curv_win, g_avg, g2_avg and scalars_memory are just auxiliary memory for computing moving averages (see the publication). Takes arguments beta: coefficient for moving averages, curv_win_width: timeframe when average squared gradient is being stored, epsilon: for numerical purposes, nesterov and zero_debias for debias of moving average. )DOC")
 
 Inputs: (0) "param": Parameters to be updated; (1) "moment": Momentum; (2) "lr": Learning rate; (3) "mu": Momentum coefficient; (4) "curv_win": Memory for latest curvature ranges; (5) "g_avg": Moving average of gradient; (6) "g2_avg": Moving average of squared gradient; (7) "scalars_memory": Memory for stateful scalars; (8) "grad": Gradient computed; (9) "iter": Iteration number.
 
 Outputs: (0) "output_param": Parameters to be updated; (1) "output_moment": Momentum; (2) "output_lr": Output learning rate; (3) "output_mu": Output momentum coefficient; (4) "output_curv_win": Output memory for latest curvature ranges; (5) "output_g_avg": Output moving average of gradient; (6) "output_g2_avg": Output moving average of squared gradient; (7) "output_scalars_memory": Output memory for stateful scalars.
 
 Args: "beta": Default ...; "curv_win_width": Default 20; "epsilon": Default ...; "nesterov": Default false; "zero_debias": ...
 
 SHOULD_NOT_DO_GRADIENT (YellowFin)
 
void initNNPACK ()
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE (Conv, NNPACK, NNPACKConvOp)
 
 REGISTER_CPU_OPERATOR (QuantDecompZstd, QuantDecompZstdOp)
 
INT_MAX SetDoc (R"DOC( Decompress a set of tensors that are compressed using zstd. The data can be compressed using mutils.compress_data_list(), see quant_decomp_op_test.py for an example. The number of outputs depended on the input. )DOC").Input(0
 
INT_MAX Compressed data in tensor (uint8_t)
 
 SHOULD_NOT_DO_GRADIENT (QuantDecompZstd)
 
bool are_nodes_common (const Graph &g, int model_idx, int candidate_idx)
 
 REGISTER_TRANSFORM (CommonSubexpressionElimination, CommonSubexpressionEliminationTransform)
 
 REGISTER_TRANSFORM (ConvToNNPack, ConvToNNPackTransform)
 
bool compare_ops (const OperatorDef &p_op, const OperatorDef &g_op, bool arg_match)
 
uint32_t wipe_cache ()
 
const CpuId & GetCpuId ()
 
template<class Map , typename Key = typename Map::key_type, typename Value = typename Map::mapped_type>
Map::mapped_type get_default (const Map &map, const Key &key, Value &&dflt)
 
void MurmurHash3_x86_32 (const void *key, int len, uint32_t seed, void *out)
 
void MurmurHash3_x86_128 (const void *key, const int len, uint32_t seed, void *out)
 
void MurmurHash3_x64_128 (const void *key, const int len, const uint32_t seed, void *out)
 
C10_EXPORT std::string DeviceTypeName (const int32_t &d)
 
C10_EXPORT int DeviceId (const DeviceOption &option)
 
C10_EXPORT bool IsSameDevice (const DeviceOption &lhs, const DeviceOption &rhs)
 
C10_EXPORT bool IsCPUDeviceType (int device_type)
 
C10_EXPORT bool IsGPUDeviceType (int device_type)
 
C10_EXPORT bool ReadStringFromFile (const char *filename, string *str)
 
C10_EXPORT bool WriteStringToFile (const string &str, const char *filename)
 
C10_EXPORT string ProtoDebugString (const Message &proto)
 
C10_EXPORT bool ParseProtoFromLargeString (const string &str, Message *proto)
 
C10_EXPORT bool ReadProtoFromTextFile (const char *filename, Message *proto)
 
C10_EXPORT void WriteProtoToTextFile (const Message &proto, const char *filename)
 
C10_EXPORT bool ReadProtoFromBinaryFile (const char *filename, MessageLite *proto)
 
C10_EXPORT void WriteProtoToBinaryFile (const MessageLite &proto, const char *filename)
 
bool operator== (const NetDef &l, const NetDef &r)
 
std::ostream & operator<< (std::ostream &output, const NetDef &n)
 
template<>
C10_EXPORT Argument MakeArgument (const string &name, const MessageLite &value)
 
C10_EXPORT bool HasOutput (const OperatorDef &op, const std::string &output)
 
C10_EXPORT bool HasInput (const OperatorDef &op, const std::string &input)
 
C10_EXPORT int GetArgumentIndex (const google::protobuf::RepeatedPtrField< Argument > &args, const string &name)
 
C10_EXPORT const Argument & GetArgument (const OperatorDef &def, const string &name)
 
C10_EXPORT const Argument & GetArgument (const NetDef &def, const string &name)
 
C10_EXPORT bool GetFlagArgument (const google::protobuf::RepeatedPtrField< Argument > &args, const string &name, bool default_value)
 
C10_EXPORT bool GetFlagArgument (const OperatorDef &def, const string &name, bool default_value)
 
C10_EXPORT bool GetFlagArgument (const NetDef &def, const string &name, bool default_value)
 
C10_EXPORT Argument * GetMutableArgument (const string &name, const bool create_if_missing, OperatorDef *def)
 
bool ReadProtoFromBinaryFile (const string filename, MessageLite *proto)
 
void WriteProtoToBinaryFile (const MessageLite &proto, const string &filename)
 
bool ReadProtoFromTextFile (const string filename, Message *proto)
 
void WriteProtoToTextFile (const Message &proto, const string &filename)
 
bool ReadProtoFromFile (const char *filename, Message *proto)
 
bool ReadProtoFromFile (const string &filename, Message *proto)
 
template<class IterableInputs = std::initializer_list<string>, class IterableOutputs = std::initializer_list<string>, class IterableArgs = std::initializer_list<Argument>>
OperatorDef CreateOperatorDef (const string &type, const string &name, const IterableInputs &inputs, const IterableOutputs &outputs, const IterableArgs &args, const DeviceOption &device_option=DeviceOption(), const string &engine="")
 
template<class IterableInputs = std::initializer_list<string>, class IterableOutputs = std::initializer_list<string>>
OperatorDef CreateOperatorDef (const string &type, const string &name, const IterableInputs &inputs, const IterableOutputs &outputs, const DeviceOption &device_option=DeviceOption(), const string &engine="")
 
template<typename T >
CAFFE2_API Argument MakeArgument (const string &name, const T &value)
 
template<typename T >
void AddArgument (const string &name, const T &value, OperatorDef *def)
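
These helpers are the usual way to assemble an OperatorDef programmatically. A small usage sketch based only on the signatures listed above; the operator type, blob names, and header paths are illustrative:

    #include "caffe2/proto/caffe2_pb.h"
    #include "caffe2/utils/proto_utils.h"

    // Build an OperatorDef for a "Sum"-style op with two inputs and one
    // output, then attach an extra integer argument to it.
    caffe2::OperatorDef MakeExampleDef() {
      caffe2::OperatorDef def = caffe2::CreateOperatorDef(
          "Sum", "my_sum_op", {"in_a", "in_b"}, {"out"});
      caffe2::AddArgument<int>("some_int_arg", 1, &def);
      return def;
    }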
 
bool operator== (const DeviceOption &dl, const DeviceOption &dr)
 
CAFFE2_API const ::std::string & GetEmptyStringAlreadyInited ()
 
void ShutdownProtobufLibrary ()
 
std::vector< std::string > split (char separator, const std::string &string)
 
std::string trim (const std::string &str)
 
size_t editDistance (const std::string &s1, const std::string &s2, size_t max_distance)
 
int32_t editDistanceHelper (const char *s1, size_t s1_len, const char *s2, size_t s2_len, std::vector< size_t > &current, std::vector< size_t > &previous, std::vector< size_t > &previous1, size_t max_distance)
 
CAFFE2_API bool StartsWith (const std::string &str, const std::string &prefix)
 
CAFFE2_API bool EndsWith (const std::string &full, const std::string &ending)
 
int Do256NOPs ()
 
template<typename T >
T WaitForVariableChange (std::atomic< T > *var, T initial_value, std::condition_variable *cond, std::mutex *mutex)
 
void OpticalFlowExtractor (const cv::Mat &prev_gray, const cv::Mat &curr_gray, const int flow_alg_type, cv::Mat &flow)
 
void MergeOpticalFlow (cv::Mat &prev_flow, const cv::Mat &curr_flow)
 
void MultiFrameOpticalFlowExtractor (const std::vector< cv::Mat > &grays, const int optical_flow_alg_type, cv::Mat &flow)
 
 REGISTER_CPU_OPERATOR (VideoInput, VideoInputOp< CPUContext >)
 
 TensorInferenceFunction ([](const OperatorDef &def, const vector< TensorShape > &){ArgumentHelper helper(def);int batch_size=helper.GetSingleArgument< int >("batch_size", 0);int clip_per_video=helper.GetSingleArgument< int >("clip_per_video", 1);int crop_height=helper.GetSingleArgument< int >("crop_height", helper.GetSingleArgument< int >("crop_size", 0));int crop_width=helper.GetSingleArgument< int >("crop_width", helper.GetSingleArgument< int >("crop_size", 0));int length_rgb=helper.GetSingleArgument< int >("length_rgb", 0);int channels_rgb=helper.GetSingleArgument< int >("channels_rgb", 3);int length_of=helper.GetSingleArgument< int >("length_of", 0);int channels_of=helper.GetSingleArgument< int >("channels_of", 2);bool get_rgb=helper.GetSingleArgument< bool >("get_rgb", true);bool get_optical_flow=helper.GetSingleArgument< bool >("get_optical_flow", false);bool do_multi_label=helper.GetSingleArgument< bool >("do_multi_label", false);bool get_video_id=helper.GetSingleArgument< bool >("get_video_id", false);int output_size=1;if(get_rgb){output_size++;}if(get_optical_flow){output_size++;}if(get_video_id){output_size++;}int index=0;vector< TensorShape > out(output_size);CHECK_GT(crop_height, 0);CHECK_GT(crop_width, 0);batch_size *=clip_per_video;if(get_rgb){out[index++]=CreateTensorShape(vector< int >{batch_size, channels_rgb, length_rgb, crop_height, crop_width}, TensorProto::FLOAT);}if(get_optical_flow){out[index++]=CreateTensorShape(vector< int >{batch_size, channels_of, length_of, crop_height, crop_width}, TensorProto::FLOAT);}if(!do_multi_label){out[index++]=CreateTensorShape(vector< int >{1, batch_size}, TensorProto::INT32);}else{int num_of_class=helper.GetSingleArgument< int >("num_of_class", 0);out[index++]=CreateTensorShape(vector< int >{batch_size, num_of_class}, TensorProto::INT32);}if(get_video_id){out[index]=CreateTensorShape(vector< int >{1, batch_size}, TensorProto::INT32);}return out;})
 
 NO_GRADIENT (VideoInput)
 
 REGISTER_CUDA_OPERATOR (VideoInput, VideoInputOp< CUDAContext >)
 
void Saturation (float *clip, const int length, const int crop_height, const int crop_width, const float alpha_rand, std::mt19937 *randgen)
 
void Brightness (float *clip, const int length, const int crop_height, const int crop_width, const float alpha_rand, std::mt19937 *randgen)
 
void Contrast (float *clip, const int length, const int crop_height, const int crop_width, const float alpha_rand, std::mt19937 *randgen)
 
void ColorJitter (float *clip, const int length, const int crop_height, const int crop_width, const float saturation, const float brightness, const float contrast, std::mt19937 *randgen)
 
void ColorLighting (float *clip, const int length, const int crop_height, const int crop_width, const float alpha_std, const std::vector< std::vector< float >> &eigvecs, const std::vector< float > &eigvals, std::mt19937 *randgen)
 
void ColorNormalization (float *clip, const int length, const int crop_height, const int crop_width, const int channels, const std::vector< float > &mean, const std::vector< float > &inv_std)
 
void ClipTransformRGB (const unsigned char *buffer_rgb, const int multi_crop_count, const int crop_height, const int crop_width, const int length_rgb, const int channels_rgb, const int sampling_rate_rgb, const int height, const int width, const int h_off, const int w_off, const int *multi_crop_h_off, const int *multi_crop_w_off, const bool mirror_me, const bool color_jitter, const float saturation, const float brightness, const float contrast, const bool color_lighting, const float color_lighting_std, const std::vector< std::vector< float >> &color_lighting_eigvecs, const std::vector< float > &color_lighting_eigvals, const std::vector< float > &mean_rgb, const std::vector< float > &inv_std_rgb, std::mt19937 *randgen, float *transformed_clip)
 
void ClipTransformOpticalFlow (const unsigned char *buffer_rgb, const int crop_height, const int crop_width, const int length_of, const int channels_of, const int sampling_rate_of, const int height, const int width, const cv::Rect &rect, const int channels_rgb, const bool mirror_me, const int flow_alg_type, const int flow_data_type, const int frame_gap_of, const bool do_flow_aggregation, const std::vector< float > &mean_of, const std::vector< float > &inv_std_of, float *transformed_clip)
 
void FreeDecodedData (std::vector< std::unique_ptr< DecodedFrame >> &sampledFrames)
 
bool DecodeMultipleClipsFromVideo (const char *video_buffer, const std::string &video_filename, const int encoded_size, const Params &params, const int start_frm, const int clip_per_video, const bool use_local_file, int &height, int &width, std::vector< unsigned char * > &buffer_rgb)
 
 REGISTER_CPU_OPERATOR (BatchPermutation, BatchPermutationOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (BatchPermutationGradient, BatchPermutationGradientOp< float, CPUContext >)
 
NumInputs(2).NumOutputs(1).SetDoc(R"DOC( Permute the batch elements of the input tensor X according to the permutation specified in the input indices. Warning gradient comptuation is only correct if indices is a permutation DOC Input (0,"X","Tensor of at least 1D shape (N, D0, D1, ...).").Input(1
 
NumInputs(2).NumOutputs(1).SetDoc(R"DOC( Permute the batch elements of the input tensor X according to the permutation specified in the input indices. Warning gradient comptuation is only correct if indices is a permutation DOC tensor of type int with shape (N,) specifying a valid permutation" "of the indices in[0
 
NumInputs(2).NumOutputs(1).SetDoc(R"DOC( Permute the batch elements of the input tensor X according to the permutation specified in the input indices. Warning gradient comptuation is only correct if indices is a permutation DOC tensor of type int with N (inclusive).") .Output( 0
 
NumInputs(2).NumOutputs(1).SetDoc(R"DOC( Permute the batch elements of the input tensor X according to the permutation specified in the input indices. Warning gradient comptuation is only correct if indices is a permutation DOC tensor of type int with Tensor with the same shape as X where the (D0, D1,...) dimensional" "batch elements of X are permuted according to the input indices.")
 
See BatchPermutation Input (1,"dY","Gradient of forward output 0 (Y).").Output(0
 
 REGISTER_GRADIENT (BatchPermutation, GetBatchPermutationGradient)
 
 REGISTER_CPU_OPERATOR (GroupSpatialSoftmax, GroupSpatialSoftmaxOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (GroupSpatialSoftmaxGradient, GroupSpatialSoftmaxGradientOp< float, CPUContext >)
 
 Arg "num_classes": number of classes in each softmax group. Input (0) "scores": 4D tensor of softmax inputs (called 'scores' or 'logits') with shape (N, C, H, W), where C = num_anchors * num_classes defines num_anchors groups of contiguous num_classes softmax inputs. Output (0): softmax probabilities with the same shape as the input.
 
 GroupSpatialSoftmaxGradient inputs: (0) "scores": See GroupSpatialSoftmax; (1) "d_probabilities": Gradient of forward output 0 (probabilities). Output (0): Gradient of forward input 0 (scores).
 
 REGISTER_GRADIENT (GroupSpatialSoftmax, GetGroupSpatialSoftmaxGradient)
 
 REGISTER_CPU_OPERATOR (PSRoIPool, PSRoIPoolOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (PSRoIPoolGradient, PSRoIPoolGradientOp< float, CPUContext >)
 
 Arg "spatial_scale": Spatial scale of the input feature map X relative to the input image (e.g., if X has a stride of ... w.r.t. the input image). Arg "group_size": (int) default 1; pooled_h = pooled_w = group_size, where pooled_{h,w} is the pooled output Y's height and width, respectively. Arg "output_dim": number of channels in the pooled output, which might be the number of classes if used for classification, or ... if used for class-agnostic bounding box regression.
 
 Inputs: (0) "X": 4D position sensitive feature map input of shape (N, C, H, W), where C = group_size**2 * output_dim; (1): ...
 
 REGISTER_CPU_OPERATOR (RoIPoolF, RoIPoolFOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (RoIPoolFGradient, RoIPoolFGradientOp< float, CPUContext >)
 
 Arg "pooled_w": Pooled output Y's width. Input (0) "X": 4D feature map input of shape (N, C, H, W). Input (1): ...
 
 REGISTER_CPU_OPERATOR (SampleAs, SampleAsOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (SampleAsGradient, SampleAsGradientOp< float, CPUContext >)
 
 Inputs: (0): Tensor of at least shape (N, ...); (1): ...
 
 SampleAsGradient inputs: (0): See SampleAs; (1) "labels": See SampleAs; (2): ...
 
 REGISTER_GRADIENT (SampleAs, GetSampleAsGradient)
 
 REGISTER_CPU_OPERATOR (SelectSmoothL1Loss, SelectSmoothL1LossOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (SelectSmoothL1LossGradient, SelectSmoothL1LossGradientOp< float, CPUContext >)
 
 Arg "beta": L2 to L1 transition point. Arg "scale": (float) default 1.0; multiply the loss by this scale factor.
 
 Inputs: (0) "Y_hat": tensor of bounding box regression predictions with shape (N, 4 * num_bbox_classes * num_anchors, H, W); (1) "Y": tensor of labels with shape (M, 4) for 4 contiguous channels starting at each of the M locations selected by the locations input; (2) "locations": tensor of shape (M, 4) that identifies the M 'select' locations encoded by the four columns; (3) "normalizer": the loss is divided by max(1, normalizer).
 
 Output: (0) "loss": Scalar loss.
 
 SelectSmoothL1LossGradient inputs: (0) "Y_hat": See SelectSmoothL1Loss; (1) "Y": See SelectSmoothL1Loss; (2) "locations": See SelectSmoothL1Loss; (3) "normalizer": See SelectSmoothL1Loss; (4) "d_loss": Gradient of forward output 0 (loss). Output: (0): Gradient of forward input 0 (Y_hat).
 
 REGISTER_GRADIENT (SelectSmoothL1Loss, GetSelectSmoothL1LossGradient)
 
 REGISTER_CPU_OPERATOR (SigmoidCrossEntropyLoss, SigmoidCrossEntropyLossOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (SigmoidCrossEntropyLossGradient, SigmoidCrossEntropyLossGradientOp< float, CPUContext >)
 
 Arg "scale": multiply the loss by this scale factor. Arg "normalize": (int) default 1; if true, divide the loss by the number of targets > -1.
 
 Inputs: (0) "X": Tensor of predicted logits (shape must be at least 1D); (1) "targets": Tensor of targets of type int and same shape as logits X. Output: (0) "loss": Scalar loss.
 
 SigmoidCrossEntropyLossGradient inputs: (0) "X": See SigmoidCrossEntropyLoss; (1) "targets": See SigmoidCrossEntropyLoss; (2) "d_loss": Gradient of forward output 0 (loss). Output: (0): Gradient of forward input 0 (X).
 
 REGISTER_GRADIENT (SigmoidCrossEntropyLoss, GetSigmoidCrossEntropyLossGradient)
 
 REGISTER_CPU_OPERATOR (SigmoidFocalLoss, SigmoidFocalLossOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (SigmoidFocalLossGradient, SigmoidFocalLossGradientOp< float, CPUContext >)
 
 Doc excerpt: N is the number of elements in the batch, H and W are the height and width, and each group of logits has length num_classes. For the binary form of Focal Loss, num_classes does not include the background category (so, for COCO, num_classes = 80, not 81).
 
 Args: "scale": multiply the loss by this scale factor; "alpha": (float) default 0.25; Focal Loss's alpha hyper-parameter; "gamma": Focal Loss's gamma hyper-parameter; "num_classes": (int) default 80; number of classes (excluding background).
 
 Inputs: (0) "logits": 4D tensor of sigmoid inputs (called 'scores' or 'logits') with shape (N, ...); (1): ...; (2): ...
 
 SigmoidFocalLossGradient inputs: (0) "logits": See SigmoidFocalLoss; (1) "labels": See SigmoidFocalLoss; (2): See SigmoidFocalLoss; (3) "d_loss": Gradient of forward output 0 (loss). Output: (0): Gradient of forward input 0 (logits).
 
 REGISTER_GRADIENT (SigmoidFocalLoss, GetSigmoidFocalLossGradient)
 
 REGISTER_CPU_OPERATOR (SmoothL1Loss, SmoothL1LossOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (SmoothL1LossGradient, SmoothL1LossGradientOp< float, CPUContext >)
 
NumInputs(4).NumOutputs(1).SetDoc(R"DOC( Smooth L1 Loss is a minor variation of Huber loss in which the point of transition between L2 loss and L1 loss is adjustable by a hyper-parameter beta L2 to L1 transition point Tensor of predictions (at least 1D).") .Input( 1
 
NumInputs(4).NumOutputs(1).SetDoc(R"DOC( Smooth L1 Loss is a minor variation of Huber loss in which the point of transition between L2 loss and L1 loss is adjustable by a hyper-parameter beta L2 to L1 transition point Tensor of Tensor of labels with the same shape as Y_hat Input (2,"alpha_in","Tensor of inside weights with the same shape as Y.").Input(3
 
See SmoothL1Loss Input (1,"Y","See SmoothL1Loss.").Input(2
 
See SmoothL1Loss See SmoothL1Loss Input (3,"alpha_out","See SmoothL1Loss.").Input(4
 
 REGISTER_GRADIENT (SmoothL1Loss, GetSmoothL1LossGradient)
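
For reference, the beta-parameterized smooth L1 (Huber-style) point-wise loss that the description above refers to, as a small sketch; the operator additionally applies the inside/outside weights and any scaling/normalization, which are omitted here:

    #include <cmath>

    // Point-wise smooth L1: quadratic below beta, linear above it.
    float smooth_l1_sketch(float diff, float beta) {
      const float abs_diff = std::fabs(diff);
      if (abs_diff < beta) {
        return 0.5f * diff * diff / beta;
      }
      return abs_diff - 0.5f * beta;
    }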
 
 REGISTER_CPU_OPERATOR (SoftmaxFocalLoss, SoftmaxFocalLossOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (SoftmaxFocalLossGradient, SoftmaxFocalLossGradientOp< float, CPUContext >)
 
 Doc excerpt: N is the number of elements in the batch, H and W are the height and width, and t is the target (ground truth) class. Args: "gamma": Focal Loss's gamma hyper-parameter; "num_classes": (int) default 81; number of classes in each softmax group.
 
 Inputs: (0) "scores": softmax inputs; (1) "labels": ...; (2) "normalizer": the loss is normalized by this value. Outputs: (0) "loss": Scalar loss; (1) "probabilities": 4D tensor of softmax probabilities with shape (N, C, H, W), where C = num_anchors * num_classes, and softmax was applied to each of the num_anchors groups; within a group the num_classes values sum to 1.
 
 SoftmaxFocalLossGradient inputs: (0) "scores": See SoftmaxFocalLoss; (1) "labels": See SoftmaxFocalLoss; (2) "normalizer": See SoftmaxFocalLoss; (3) "probabilities": Output 1 from SoftmaxFocalLoss; (4) "d_loss": Gradient of forward output 0 (loss). Output: (0): Gradient of forward input 0 (scores).
 
 REGISTER_GRADIENT (SoftmaxFocalLoss, GetSoftmaxFocalLossGradient)
 
 REGISTER_CPU_OPERATOR (SpatialNarrowAs, SpatialNarrowAsOp< CPUContext >)
 
 REGISTER_CPU_OPERATOR (SpatialNarrowAsGradient, SpatialNarrowAsGradientOp< CPUContext >)
 
 Inputs: (0) "A": 3D or 4D input of shape (N, H0, W0) or (N, C, H0, W0); (1) "B": 3D or 4D input of shape (N, H1, W1) or (N, C, H1, W1), where H1 <= H0 and W1 <= W0.
 
 Output: (0) "C": Sub-window of A containing rows [0, H1 - 1] (inclusive) and columns [0, W1 - 1] (inclusive).
 
 SpatialNarrowAsGradient inputs: (0) "A": See SpatialNarrowAs; (1) "B": See SpatialNarrowAs; (2): Gradient of forward output 0 (C). Output: (0): Gradient of forward input 0 (A).
 
 REGISTER_GRADIENT (SpatialNarrowAs, SpatialNarrowAsGradient)
 
 REGISTER_CPU_OPERATOR (UpsampleNearest, UpsampleNearestOp< float, CPUContext >)
 
 REGISTER_CPU_OPERATOR (UpsampleNearestGradient, UpsampleNearestGradientOp< float, CPUContext >)
 
 Arg "scale": integer upsampling factor. Output: upsampled feature map of shape (N, C, scale * H, scale * W).
 
 REGISTER_GRADIENT (UpsampleNearest, GetUpsampleNearestGradient)
 
 REGISTER_CPU_OPERATOR (Caffe2ModuleTestDynamicDummy, Caffe2ModuleTestDynamicDummyOp)
 
 OPERATOR_SCHEMA (Caffe2ModuleTestDynamicDummy)
 
 REGISTER_CAFFE2_EARLY_INIT_FUNCTION (registerGlobalPerfNetObserverCreator,&registerGlobalPerfNetObserverCreator,"Caffe2 net global observer creator")
 
 CAFFE2_MODULE (caffe2_rocksdb,"RocksDB implementation for caffe2::DB.")
 

Variables

typedef c10::Registry< std::string, std::unique_ptr< OperatorBase >, const OperatorDef &, Workspace * > OperatorRegistry
 
const int kCIFARSize = 32
 
const int kCIFARImageNBytes = kCIFARSize * kCIFARSize * 3
 
const int kCIFAR10BatchSize = 10000
 
const int kCIFAR10TestDataSize = 10000
 
const int kCIFAR10TrainBatches = 5
 
const int kCIFAR100TrainDataSize = 50000
 
const int kCIFAR100TestDataSize = 10000
 
constexpr auto kTensorBlobType = "Tensor"
 
constexpr auto kChunkIdSeparator = "#%"
 
constexpr int kDefaultChunkSize = -1
 
constexpr int kNoChunking = 0
 
std::atomic< bool > g_caffe2_has_cuda_linked {false}
 
std::atomic< bool > g_caffe2_has_hip_linked {false}
 
constexpr int CAFFE_CUDA_NUM_THREADS = 128
 
constexpr int CAFFE_CUDA_NUM_THREADS_2D_DIMX = 16
 
constexpr int CAFFE_CUDA_NUM_THREADS_2D_DIMY = 16
 
constexpr int CAFFE_MAXIMUM_NUM_BLOCKS = 4096
 
constexpr int CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMX = 128
 
constexpr int CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMY = 128
 
constexpr int kCUDAGridDimMaxX = 2147483647
 
constexpr int kCUDAGridDimMaxY = 65535
 
constexpr int kCUDAGridDimMaxZ = 65535
 
constexpr int kCUDATensorMaxDims = 8
 
constexpr int MaxDeviceTypes
 
class CAFFE2_API OperatorBase
 
 DoRunWithType2
 
std::function< void(const OperatorDef &)> GetOperatorLogger ()
 
constexpr int kCannotComputeNumOutputs = -1
 
constexpr auto kQTensorBlobQType = "QTensor"
 
constexpr int k_limit_default_ = 1000
 
constexpr auto kBlobName = "blob_name"
 
constexpr auto kAddValue = "add_value"
 
alternative key for the handler
 
const int CONV_ALGORITHM_AUTO = 0
 
const int CONV_ALGORITHM_WINOGRAD = 1
 
const char * kConvFusionDoc
 
 cpu_blob
 
 ideep_blob
 
 Doc excerpts from the ImageInput operator schema: up to batch_size images are processed per run, and GPUs can optionally be used for part of the processing. The following transformations are applied to the image: a bounding box is applied to the initial image; color jitter and color lighting are optional (defaults off), with an image brightness scale used in color jittering; "scale" sets the size of the smallest dimension of the image, while "minsize" only scales up small images (scale and minsize are mutually exclusive and must be larger than crop); if warping is requested, both dimensions of the image are set to minsize or scale, otherwise the other dimension is proportionally scaled; mirroring is optional; a vector of per-color means and standard deviations is used to normalize the color channels. Further arguments cover the bounding box coordinates, whether the input is in Caffe format, the number of CPU decode/transform threads, the DB name and type, the sizes of any outputs besides the data and label, and the shortest side desired for random image resize (defaults to [-1, -1], i.e. no random resize). Outputs: data = in[0], a tensor containing the images; any outputs after the first are tensors read from the input TensorProtos.
 
const char *const snpe_ffi_so = "libsnpe_ffi.so"
 
constexpr size_t k2b1bXBits = 2
 
constexpr size_t kL1CacheSizeBytes = 16 * 1024
 
constexpr size_t kGEMMTileSize = 64
 
constexpr size_t kGEMMTileDepthBytes = 16
 
 Doc excerpts from the Accumulate and Acos operator schemas: Accumulate accumulates the input tensor into the output tensor. If the output tensor already has the right size, we add to it; otherwise, we first initialize the output tensor to all zeros and then do accumulation. Any further calls to the operator, given that the input tensor has the same size as the output, do simple accumulations; if the output size is not the same as the input size, the output tensor is first reshaped and initialized to zero, and only then accumulated into. Acos computes the arccosine of the input tensor, element-wise.
 
 Doc excerpts from the AdjustBatch operator schema: "RealBatchSizeIn" and "RealBatchSizeOut" carry the real batch size. The op adjusts the batch size according to the max_batch_size argument; in that case, if it has two outputs, it additionally records the input batch size to the second output. When it has two inputs, it expects the second input to contain the batch size to adjust to.
 
 X = in[0]
 
 Doc excerpts from the AffineChannel, ArgMax, Assert, and BatchBucketize operator schemas: AffineChannel takes a feature map input with order NCHW or NHWC plus per-channel scale and bias inputs. ArgMax returns a tensor containing the indices of the largest element along the given axis; if the keepdims arg is *True* (default), the shape of the output tensor matches the input tensor except that the axis dimension equals 1, else the axis dimension of the output tensor is removed. Assert takes a tensor of type *bool*, *int*, *long*, or *long long* and checks whether all values are True when coerced into a boolean; in other words, for non-bool types this asserts that all values in the tensor are non-zero; it fails if any value is False after being coerced into a boolean. BatchBucketize bucketizes selected features: D is the feature_dim, the indices input contains the indices of the features that need to be bucketized, the lengths input splits the boundaries argument, and the boundaries input contains the border list for each feature; within each batch, indices should not contain duplicate numbers, and the number of elements in indices should be less than or equal to D.
 
const vector< TensorShape > & in
 
ArgumentHelper helper (def)
 
const auto & data_dims = GetDimsVector(in[0])
 
const auto & indices_dims = GetDimsVector(in[1])
 
vector< int > output_dims
 
 out [0] = CreateTensorShape(output_dims, TensorProto::FLOAT)
 
 Doc excerpts from the Gather, BatchSparseToDense, Percentile, and BooleanMask operator schemas: Gather takes a DATA tensor of rank r and an indices vector; BatchSparseToDense takes lengths, indices, and values vectors, where within each batch the indices should not contain duplicate numbers; Percentile additionally needs information about the feature value distribution, passed as several vectors that keep the data-to-percentile mapping, between whose points the interpolation is applied (e.g. building R = [0.1, 0.4, 0.5, 0.3, 1.2] from sub-lists such as R_2 = [0.3, 1.2]); BooleanMask takes a data tensor and a mask tensor of the same shape as data and returns the masked data (and, optionally, masked_indices).
 
const float minf = -1.0f * std::numeric_limits<float>::infinity()
 
 Doc excerpts from the BooleanUnmask operator schema: the op reconstructs values together according to masks. A comprehensive example interleaves several (mask, values) pairs (e.g. values1 = 1.0, ..., mask2 = False, ..., values3 = 4.0, ...) and reconstructs the full output from them. Note that for every position there must be at least one True across the masks (all-False at a position is not allowed); if a position is True in more than one mask, we accept the first value and no longer expect a value for that location.
 
type
 
 Y
 
Given the gradient for the output of SpatialBN and the per-channel mean and inverse std var vectors for the input, computes the per-channel bias and scale gradient to be used during the backward pass for subsequent spatial batch normalization gradient calculation. Typically, the results of this op are subsequently reduced over multiple devices to obtain statistics over a larger batch size in cases where the batch size for a single model copy is too low to yield the full benefit of batch normalization. The resulting bias and scale can then be plugged back into SpatialBNGradient to get results over the larger batch size.

mean: The mean saved from the forward pass as a 1-dimensional tensor of size C. output_grad: Gradient for the output layer of SpatialBN, here used as input because we are on the backward pass. bias_grad
 
Computes the sum of all elements per channel and the sum of all elements squared per channel. These values can be reduced across multiple batches and used to obtain the mean and variance across the full set of batches. Using the new mean and variance as input to SpatialBN has the effect of changing the batch size over which SpatialBN is applied. Output: sum
 
 kv_handler
 
kv_handler: Key value handler. comm_world: A common world for collective operations. rank: int rank of this node in the common world.

existing_comm_world, common_world: The common world to be destroyed.

The reduced result on root; the allreduced tensor, same on all nodes.

dst: An int CPUtensor specifying the rank; if given, this overrides the 'to' argument of the op (the rank to send the tensor to). bool: if set, only send the content and assume that the receiver already knows the tensor's shape and information.

src: An int CPUtensor specifying the rank; if given, this overrides the 'from' argument of the op. The received tensor. tag
 
INT_MAX split
 
INT_MAX int
 
INT_MAX lengths
 
INT_MAX The tensor whose element l_i indicates the logic block of input Either NHWC or NCHW
 
INT_MAX The tensor l_i indicates the logic block of input Either NHWC or will split on C defaults to NCHW given a lengths along the specified axis If K outputs are provided
 
apply conditional DataT
 
apply conditional Data to use when True DataO
 
const char kConvDoc []
 
an input weight tensor $filter, and optionally an input bias tensor $bias. It then computes the transposed convolution, sometimes referred to as deconvolution, and produces a single output tensor $Y. The hyperparameters of the op such as kernel size, stride (stride = 2 in the example), and padding are specified as args. At each stride, the filter is deconvolved with a subset of $X and the $bias is added; this is done throughout the input data until the output computation is complete. The output shapes are computed as follows: the number of channels in the output feature map is the number of kernels specified in the filter blob, and the spatial height and width are computed from the kernel, stride, and pad sizes.
 
const char * githubLinks
 
const char * kCountExample
 
 counter
 
default must be
 
 previous_count
 
Input tensor which is almost always the result of a softmax operation. $X is an array of size $NxD
 
logits = in[0]
 
R matrix of logits for each example and class xentropy
 
matrix of logits for each example and class weights
 
Maximum number of candidates to carry over to the next activation step.

INPUTS: float Tensor sized [max_activation_length, batch_size, alphabet_size] of the network. SEQ_LEN: optional int vector containing sequence lengths, having size [batch_size]; seq_len will be set to max_time if not provided. VALUES: ...

When merge_repeated is true, merge repeated classes in output. OUTPUT_LEN: float Tensor sized [max_time, batch_size, num_classes].
 
as well as can tie together different blobs in a data dependency DOC
 
or input tensor Z
 
Y with different shapes and produces one output float tensor of the dot product between X and Y We currently support two kinds of strategies to achieve this Before doing normal dot_product pad the smaller Y must be equal Only the second dimension of X or Y can be padded DOC or input tensor whether to replicate the smaller tensor or not
 
INT_MAX Subnet with blob bindings Indices of corresponding outer workspace blobs
 
INT_MAX Subnet with blob bindings Indices of corresponding outer workspace in order
 
INT_MAX Subnet with blob bindings Indices of corresponding outer workspace in List of blobs from the forward Do int out bool { return true
 
ArgumentHelper argsHelper (def)
 
default __pad1__
 
default perform dropout If non signifying which elements are dropped out If is_test is nonzero
 
The op takes an input tensor $X of shape $NxD, a weight vector $w of length $D, and a bias vector $b of length $D. Here, $N represents the batch size and $D represents the length of the feature vectors. The output $Y is a tensor of shape $NxD and is calculated as $$Y_{ij} = X_{ij}w_j + b_j \ for \ i\in{N}, j\in{D}$$
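For illustration only, here is a minimal sketch of that computation on raw arrays; the function name and row-major layout are hypothetical and not part of the operator API.

#include <vector>

// Sketch: Y[i][j] = X[i][j] * w[j] + b[j] for N rows of length D (row-major).
std::vector<float> ElementwiseLinearRef(const std::vector<float>& X,
                                        const std::vector<float>& w,
                                        const std::vector<float>& b,
                                        int N, int D) {
  std::vector<float> Y(N * D);
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < D; ++j) {
      Y[i * D + j] = X[i * D + j] * w[j] + b[j];
    }
  }
  return Y;
}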
 
and the dimensions of the second input is the contiguous subset of the dimensions of the first For the following tensor shapes are supported
 
 param
 
Parameters to be normalized grad
 
Parameters to be normalized Gradient computed
 
element-wise. This operation can be done in an in-place fashion by providing the same input and output blobs. Github Link

NumInputs(2).NumOutputs(1).SetDoc(R"DOC( Broadcast the input tensor to a materialized new tensor using given shape. Broadcast rule is similar to "numpy.array(input)*numpy.ones(shape)": two corresponding dimensions must have the same value, or one of them equals 1. In order to align with PyTorch's expand, shape is allowed to have entries equal to -1, which means to preserve the size of the corresponding dimension.
 
*and produces a single output tensor *expanded *The op also takes an argument *dims *with a list of dimensions for where to add the single dimensional entries If the same blob is provided as input and the operation is copy free This is the exact inverse operation of *Squeeze *Github dims =[0,1]
 
auto originalSize = dims.size()
 
std::vector< int > newDims
 
The data types supported are *float*, *int32*, *int64*, and *bool*. If the dtype argument is not provided, the data type of value is used. The output tensor shape is either specified by the shape argument or will match the shape of the input tensor if one is provided. If input_as_shape is set, the input should be a tensor containing the desired output shape. For dtype, use the integer keys from the *DataType* enum in TensorProto.
 
 FLOAT = 1
 
 INT32 = 2
 
 BYTE = 3
 
 STRING = 4
 
 BOOL = 5
 
 UINT8 = 6
 
 INT8 = 7
 
 UINT16 = 8
 
 INT16 = 9
 
 INT64 = 10
 
 FLOAT16 = 12
 
 DOUBLE = 13
 
shape: input must be in CPU context. min, max: the range can be defined either by arguments or input blobs; min and max are inclusive. If the range is given by inputs, you also need to give the shape as input. When the range is given as arguments, ...

avoid: its elements will be excluded from uniform sampling; using the second input will require you to provide shape via the first input. Maximum (inclusive). The shape of the output tensor; cannot set the shape argument and pass in an input at the same time. Tensor containing the desired output shape; first input must be in CPU context.

If *input_as_shape* is set to *true*, then the *input* should be a tensor containing the desired output shape; the *shape* argument should **not** be set. Note
 
 index
 
 query
 
 d_1
 
 scale_bias_quantized_input
 
auto bitwidth = helper.GetSingleArgument<int32_t>("bitwidth", 8)
 
size_t data_per_byte = 8 / bitwidth
 
 quantized_input
 
Fused tail
 
but operating on bit rowwise quantized matrices with fused uint8 tensor with rank obtained with OUTPUT
 
Applies bounding box regression result deltas as well as predefined bounding box shapes (anchors); greedy non-maximum suppression is applied to generate the final bounding boxes.

Args: int RPN_PRE_NMS_TOP_N; float RPN_NMS_THRESH; for rotated boxes, angle is normalized to be within [angle_bound_lo, angle_bound_hi].

Inputs: scores (scores from conv layer), bbox_deltas (bounding box deltas from conv layer), im_info (image info), anchors (bounding box anchors). Outputs: rois (proposals), rois_probs (scores of proposals).
 
*type depends on dtype
 
in a sequence-length-aware fashion. Concretely, given the previous hidden state and the sequence lengths, computes the GRU activations, avoiding computation for timesteps past the given sequence_length. Bool to determine if the hidden state is zeroed or passed along for timesteps past the given sequence_length. hidden: ... When false, the sequence lengths input is left out, and all following inputs are shifted left by one.
 
INT_MAX Net executed when condition is true condition
 
auto pad = helper.GetSingleArgument<int>("pad", 0)
 
auto kernel_h
 
auto kernel_w
 
auto dilation_h
 
auto dilation_w
 
auto stride_h
 
auto stride_w
 
int N = 0
 
int C = 0
 
int H = 0
 
int W = 0
 
const int dkernel_h = dilation_h * (kernel_h - 1) + 1
 
const int dkernel_w = dilation_w * (kernel_w - 1) + 1
 
const int out_h = (H + 2 * pad - dkernel_h) / stride_h + 1
 
const int out_w = (W + 2 * pad - dkernel_w) / stride_w + 1
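As a quick numeric illustration of the output-size formulas above (the values are made up for the example):

// With H = W = 32, kernel = 3, pad = 1, stride = 1, dilation = 1:
//   dkernel = 1 * (3 - 1) + 1 = 3
//   out_h   = (32 + 2 * 1 - 3) / 1 + 1 = 32   (spatial size preserved)
// With stride = 2 instead:
//   out_h   = (32 + 2 * 1 - 3) / 2 + 1 = 16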
 
Max number of elements
 
Given a tensor of keys, return an Int tensor of the same shape containing the indices for each of the keys. If the index is frozen, unknown entries are given index 0; otherwise, new entries are added into the index. If an insert is necessary but max_elements has been reached, fail.

keys: Tensor of keys to be looked up. Output(0,"indices","Indices for each of the keys.").ScalarType(TensorProto...). Freezes the index, disallowing creation of new index entries; should not be called concurrently with IndexGet.

handle: Pointer to an Index instance. The input handle. If set, skips the first entry of the tensor; this allows loading tensors that are aligned with an embedding, where the first element of the output tensor will be the element of the index. items
 
array of probabilities for prediction L
 
array of probabilities for prediction array of JSD losses
 
 INT_MAX
 
auto input_dims_long = GetDimsVector(in[0])
 
const auto canonical_axis
 
default. Takes an input tensor $X and produces one output tensor $Y of the same shape as $X. The op performs the element-wise leaky relu operation, defined as $$y = \max(0, x) + alpha \cdot \min(0, x)$$; $Y is calculated as described above.

Given a DATA tensor and a LENGTHS tensor of rank 1, pad each segment in DATA so that each segment's length is target_length. It will throw if there is a segment of length larger than target_length. Example: ... LENGTHS
 
 where
 
For each row, weights are accessed by indices [0..L-1], where L is the length of the given row. This is basically a fused operator. WEIGHT

NumInputs(4).NumOutputs(1).ValueLengthInputFillers(SparseLengths8BitsRowwiseOp< CPUContext >..., SparseLengths8BitsRowwiseOp< CPUContext, 1 >::LENGTHS). DATA: uint8 tensor obtained with operator FloatToRowwiseQuantized8Bits. INDICES: Integer vector containing indices of the first dimension of DATA for the slices that are being aggregated. scale_bias: Matrix of floats, each row r_i of which stores a pair s_i,
 
SparseLengths8BitsRowwiseOp< CPUContext, 0, 1 >::LENGTHS SetDoc (R"DOC( Variation of SparseLengthsMean operator, where DATA is stored using 8bits. DATA was quantized with 8Bit row-wise quantization (see doc to FloatToRowwiseQuantized8Bits operator). To restore DATA from 8Bit, we use additional input that stores scales and biases. )DOC").Input(0
 
SparseLengths8BitsRowwiseOp< CPUContext, 0, 1 >::LENGTHS uint8 tensor obtained with operator FloatToRowwiseQuantized8Bits") .Input (1,"INDICES","Integer vector containing indices of the first ""dimension of DATA for the slices that are being aggregated").Input(2
 
SparseLengths8BitsRowwiseOp< CPUContext, 0, 1 >::LENGTHS uint8 tensor obtained with Vector with the same sum of elements as the first dimension of DATA Input (3,"scale_bias","Matrix of floats, each row r_i of which stores a pair ""s_i, b_i -- scale and bias for i-th row").Output(0
 
SparseLengths8BitsRowwiseOp< CPUContext, 1, 1 >::LENGTHS uint8 tensor obtained with Integer vector containing indices of the first dimension of DATA for the slices that are being aggregated Matrix of each row r_i of which stores a pair b_i scale and bias for i th row Output (0,"output","output")
 
where segments are defined by their and concatenate them in an output tensor of the output value will be padded and the corresponding output indices will be padded by DOC Tensor of int32 lengths of rank TopKIndices
 
default save the db directly to the path specified by the db arg If not prepend the path of the current root folder of the workspace to the path specified by the db arg characters that precede strip_prefix will be removed Useful for removing device scope from blob names leveldb
 
default __pad2__
 
but allows one to save to db every few iterations
 
default __pad3__
 
default __pad4__
 
default Y_scale
 
stride sizes
 
stride, pad lengths, and dilation. $L_p pooling consists of taking the $L_p norm of a subset of the input tensor according to the kernel size and downsampling the data into the output blob for further processing. Pooling layers reduce the spatial dimensionality of the input blob; each of the output blob's dimensions will reduce accordingly (kernel = 2 in the example). p
 
and label is applied to the tensor elementwise If y
 
ArgumentHelper arg_helper (def)
 
int axis_a = arg_helper.GetSingleArgument<int>("axis_a", 1)
 
int axis_b = arg_helper.GetSingleArgument<int>("axis_b", 1)
 
int trans_a = arg_helper.GetSingleArgument<bool>("trans_a", false)
 
int trans_b = arg_helper.GetSingleArgument<bool>("trans_b", false)
 
int canonical_axis_a = canonical_axis_index_(axis_a, in[0].dims().size())
 
int canonical_axis_b = canonical_axis_index_(axis_b, in[0].dims().size())
 
int M = size_to_dim_(canonical_axis_a, GetDimsVector(in[0]))
 
 A
 
 B
 
then the resulted tensor have the reduced dimension pruned DOC Keep the reduced default True keeps the reduced An input tensor variance
 
 prediction
 
D float i e
 
D float i batch size D is number of possible classes labels accuracies
 
axis to normalize
 
The input tensor tiled_data
 
Bucketize it based on the boundary values and then do one-hot encoding. The lengths specify the number of boundary values for each column; the final number of buckets is this number plus one, which would also be the expanded feature size. boundaries specifies all the boundary values. Note that each bucket is right-inclusive; that is, given boundary values [b1, b2, b3],
 
INT_MAX Net executed on each iteration Whether to use the condition input Do not create new scopes Use this only if you re certain there will be no name collision
 
INT_MAX Net executed on each iteration Whether to use the condition input Do not create new scopes Use this only if you re certain there will be no name for example if you re converting from a fully SSA IR max_trip_count
 
d int long tensor contains the length in each of the output packed_tensor
 
d int long tensor contains the length in each of the output N dim Tensor where presence_mask
 
d int long tensor contains the length in each of the output N dim Tensor where dim boolean false where packed_tensor is padded
 
d int long tensor contains the length in each of the output N dim Tensor where dim boolean false where packed_tensor is true otherwise Padding number in the packed segments Use true to pad infinity
 
CPUContext::PadTensorInference Input data tensor from the previous operator
 
dimensions depend on whether the NCHW or NHWC operators are being used For in the former
 
dimensions depend on whether the NCHW or NHWC operators are being used For in the the input has where N is the batch C is the number of channels
 
Given a sample set of raw values, labeled with their corresponding percentiles from the same distribution. In particular:

value_to_pct: Sorted tensor with columns in which each element of the first column is a float representing the raw value of a sample, and its corresponding element in the next column represents the percentile it maps to.

percentile_values: tensor with the same dimensions as the flattened input tensor; each element corresponds to the percentile calculated for original_values[i].
 
 probabilities
 
Given 1-D or 2-D slopes and intercepts, the output tensor has the same shape as the input predictions and contains the predictions transformed by the piecewise linear functions. Each column of predictions has its own piecewise linear transformation functions; therefore the size of the piecewise function parameters is pieces x prediction_dimensions, except for binary predictions where only the positive prediction needs them. Note that in each piece the low bound is excluded while the high bound is included. Also, the piecewise linear function must be continuous. Notes: if the input is binary, set the binary arg to true so that only one group of piecewise linear functions is needed. slopes and intercepts can be passed either through args or through input blobs. If we have multiple groups of piecewise linear functions, each group must have the same number of pieces. If a prediction is out of the bounds,
 
constexpr char kAveragePoolDoc []
 
constexpr char kMaxPoolDoc []
 
an input slope tensor $slope
 
Size of the dimension to prepend reshaped
 
Output tensor quantization offset First operand
 
Output tensor quantization offset First should share the type with the second operand Result
 
const char kAveragePoolDoc_int8 []
 
Output tensor quantization offset Pass to add the axis specified in arg axis to all input tensors concat_result
 
const char kConvDoc_int8 []
 
Output tensor quantization scale the filter blob
 
Output tensor quantization scale. Takes the filter and the bias and computes the output. Note that other parameters, such as the stride and kernel size or the pad sizes in each direction, are not necessary for input because they are provided by the ConvTransposeUnpoolOpBase operator. Various dimension checks are done implicitly, and the sizes are specified in the Input docs for this operator. As is expected, the filter is deconvolved with a subset of the image and the bias is added; this is done throughout the image data and the output is computed. filter
 
 qX
 
Coefficient of leakage
 
const char kMaxPoolDoc_int8 []
 
New shape Output tensor quantization offset new_shape
 
New shape Output tensor quantization offset New shape old_shape
 
Spatial scale of the input feature map X relative to the input image E g
 
Pooled output Y s width Int8 Tensor feature map input of RoIs
 
 rather
 
It will be coerced into one. For an arbitrary n-dimensional tensor X in [a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}], where k is the axis, X will be coerced into a 2-dimensional tensor with dimensions [a_0 * ... * a_{k-1}, a_k * ... * a_{n-1}]. For the default case, where axis = 1, this means the X tensor will be coerced into a tensor of dimensions [a_0, a_1 * ... * a_{n-1}], where a_0 is often the batch size. In this situation, we must have a_0
 
int can be passed
 
dimension For example if $X = [[1,5,2,9],[4,1,8,2],[2,7,0,3]]$ and $lengths = [2,3,1,2]$
 
then the axes dimensions are pruned Github axes =(0,1)
 
of shape $BxMxN
 
of shape where $B is the batch $M is number of rows
 
of shape where $B is the batch $M is number of and $N is number of columns The output of this is a matrix of shape $BxM
 
the value to replace NaN
 
auto actualNewShape = helper.GetRepeatedArgument<int64_t>("shape")
 
int64_t totalSize = 1
 
int unknownIdx = -1
 
 else
 
 segments
 
 embeddings
 
Prefix string to prepend extracted blobs blob_names
 
Prefix string to prepend extracted blobs tensor of strings containing extracted blob names
 
the implementation takes an the hidden state the cell and a weight TxNxD
 
the implementation takes an the hidden state the cell and a weight the final hidden cell bidirectional
 
the implementation takes an the hidden state the cell and a weight the final hidden cell num_layers
 
the implementation takes an the hidden state the cell and a weight the final hidden cell rnn_mode
 
R recurrent or input R all_params
 
See RoIPoolF dY
 
See RoIPoolF Gradient of forward dX
 
there are multiple output cases
 
 float
 
default the scale to apply
 
an argument $alpha
 
an argument an argument $scale
 
affects the activation function itself This should go with the weight initialization in the paper See https
 
will use same as padding_width start_padding
 
will use same as padding_width D_n data_out
 
will use same as padding_width D_n D_1
 
will use same as padding_width considers all data as a single segment lengths_out
 
Outer size of padding present around each range data_in
 
Outer size of padding present around each range T< N, D1..., Dn > Padded input data padding_sum
 
Outer size of padding present around each range T< N, D1..., Dn > Padded input data Sum of all start paddings
 
auto starts = helper.GetRepeatedArgument<int>("starts", vector<int>())
 
auto ends = helper.GetRepeatedArgument<int>("ends", vector<int>())
 
return vector< TensorShape >
 
It will be coerced into one. For an arbitrary n-dimensional tensor X, where k is the axis, X will be coerced into a 2-dimensional tensor with dimensions [(a_0 * ... * a_{k-1}), (a_k * ... * a_{n-1})]. For the default case, the X tensor will be coerced into a 2D tensor, where $a_0 is often the batch size; in this case we must have $a_0
 
auto labels = in[1]
 
const int batch_size
 
const int num_classes
 
default weight_tensor
 
default softmax
 
default loss
 
where the softplus function
 
where the softplus $y = ln(e^x + 1)$
 
This op outputs a copy of the input tensor where values from the height and width dimensions are moved to the batch dimension. After the zero padding is applied according to the pad argument, both height and width of the input must be divisible by the block_size. Only NCHW order is currently supported. Github block_size

Followed by cropping; this is the reverse transformation of SpaceToBatch. More specifically,
 
Parameters to be normalized. Gradient computed. A bool variable to control whether to use max norm or constant norm. When use_max_norm = false, constant norm is used so that all the embedding vectors are scaled to have an L2 norm equal to A; otherwise, max norm is used so that each embedding is scaled so that its L2 norm is no larger than A. If an embedding's norm is less than A originally,
 
bool is_test = helper.GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)
 
default where $N is batch $C is number of $H is spatial height
 
default The input dimensional tensor of shape $NCHW or $NHWC depending on the order parameter The bias as a dimensional tensor of size $C to be applied to the output var
 
default The input dimensional tensor of shape $NCHW or $NHWC depending on the order parameter The bias as a dimensional tensor of size $C to be applied to the output The running sums
 
default The input dimensional tensor of shape $NCHW or $NHWC depending on the order parameter The bias as a dimensional tensor of size $C to be applied to the output The running *optional *Per channel sums of elements to be used to determine the mean and variance for this batch The output dimensional tensor of the same shape as $X The running variance after the spatial BN saved_var
 
Unscaled log probabilities Optional blob to be used to weight the samples for the loss With spatial weighting is by x
 
A Blob pointing to the newly created StatRegistry
 
If export values from given StatRegistry export values from the global singleton StatRegistry int64 tensor with exported values default true Whether to atomically reset the counters afterwards
 
returning a scalar tensor containing a pointer to it The timer is stopped by calling **TimerEnd **Github str
 
stops the timer publishing a CAFFE_EVENT Github timerget_op
 
tensor of float Index_High
 
default flag to indicate if the summarized statistics have to be written to a log file D max
 
default flag to indicate if the summarized statistics have to be written to a log file D mean and standard deviation
 
constexpr char kSummaryzeOpExtension [] = ".summary"
 
const std::int32_t tiles
 
 a_2
 
 a_n
 
r and integer argument k
 
auto valid_axes
 
tensor of int32 or int64 indices remapping
 
 time
 
The time in nanoseconds
 
bool if
 
bool saves contents to the root folder of the current workspace
 
const char kPrintFileExtension [] = ".log"
 
const ArgumentHelper args (def)
 
 sampling_cdf
 
An optional D Tensor Input cumulative sampling all values in sampling_cdf will be scaled by this number sampled_indexes
 
an index is randomly sampled from the distribution given by the weights of the corresponding batch The output is a D sampling_weights
 
an index is randomly sampled from the distribution given by the weights of the corresponding batch The output is a D A D Tensor of sampling_values
 
an index is randomly sampled from the distribution given by the weights of the corresponding batch The output is a D A D Tensor of An optional D Tensor of The output tensor contains sampled_values
 
decltype(adagrad_update__base) adagrad_update__avx_f16c
 
decltype(adagrad_update_prefetch__base) adagrad_update_prefetch__avx_f16c
 
decltype(adagrad_fp16_update_prefetch__base) adagrad_fp16_update_prefetch__avx_f16c
 
decltype(rowwise_adagrad_update__base) rowwise_adagrad_update__avx_f16c
 
decltype(sparse_adagrad_int32_t__base) sparse_adagrad_int32_t__avx_f16c
 
decltype(sparse_adagrad_int64_t__base) sparse_adagrad_int64_t__avx_f16c
 
decltype(TypedAxpyHalffloat__base) TypedAxpyHalffloat__avx2_fma
 
decltype(TypedAxpyHalffloat__base) TypedAxpyHalffloat__avx_f16c
 
decltype(TypedAxpy_uint8_float__base) TypedAxpy_uint8_float__avx2_fma
 
decltype(TypedAxpy_uint8_float__base) TypedAxpy_uint8_float__avx_f16c
 
constexpr DeviceType CPU = DeviceType::CPU
 
constexpr DeviceType CUDA = DeviceType::CUDA
 
constexpr DeviceType OPENGL = DeviceType::OPENGL
 
constexpr DeviceType OPENCL = DeviceType::OPENCL
 
constexpr DeviceType MKLDNN = DeviceType::MKLDNN
 
constexpr DeviceType IDEEP = DeviceType::IDEEP
 
constexpr DeviceType HIP = DeviceType::HIP
 
constexpr DeviceType COMPILE_TIME_MAX_DEVICE_TYPES
 
constexpr DeviceType ONLY_FOR_TEST = DeviceType::ONLY_FOR_TEST
 
 data_0
 
Weight tensor in KRSC layout W_q
 
Weight tensor in KRSC layout Weight bias tensor in a packed format
 
constexpr int nlines_log = 10000
 
Timeout in secs
 
Timeout in queue
 
The shared pointer for the BlobsQueue
 
the output status will be set to true which can be used as exit criteria for execution step The input is the queue and the last output is the status The rest are data blobs DOC The shared pointer for the BlobsQueue status
 
the output status will be set to true which can be used as exit criteria for execution step The input is the queue and the last output is the status The rest are data blobs DOC The shared pointer for the BlobsQueue Is set to depending on the success of dequeue
 
Runs the dense AdaDelta update. param: Parameters to be updated. moment_delta: Average of squared parameter updates. lr: Learning rate. grad: Gradient computed. output_param: Updated parameters. output_moment, output_moment_delta: Updated average of squared parameter updates. decay (default): the squared gradient sum is decayed by this factor.
 
Parameters to be updated moment_2
 
Parameters to be updated Second moment history learning rate Updated parameters output_moment_2
 
 moment1
 
 moment2
 
 iter
 
runs the dense Adam new_moment1
 
runs the dense Adam new_moment2 as in dense case DOC moment_1
 
runs the Adam update new_moment2
 
 input_tensor
 
Tensor of floats to be clipped additional_threshold
 
Tensor of floats to be clipped An optional additonal threshold to scale the orignal threshold clipped
 
 mutex
 
Given a parameter tensor X and its gradient dX, the local learning rate for X will be local_lr, where offset is a preset hyper-parameter to avoid numerical issues and trust indicates how much we trust the layer to change its parameters during one update. In this implementation, we use the L2 norm, and the computed local learning rate is clipped based on the upper bound lr_max and the lower bound lr_min.
 
the learning rate for performing gradient descent on learning rate lr Learning rate effgrad
 
given nesterov
 
given computes
 
given adjusted_gradient
 
given param momentum
 
given param parameter Note the difference to MomentumSGD
 
GradientSlice with gradients for updated indices Learning rate Adjusted gradient Updated parameter boolean Whether to use Nesterov Accelerated Gradient
 
param: Parameters to be updated. lr: Learning rate. curv_win: Memory for latest curvature ranges. g2_avg: Moving average of squared gradient. grad: Gradient computed. output_param: Parameters to be updated. output_lr: Output learning rate. output_curv_win: Output memory for latest curvature ranges. output_g2_avg: Output moving average of squared gradient.
 
INT_MAX compressed
 
constexpr size_t kDefaultMinWorkSize = 1
 
constexpr size_t kCacheLineSize = 64
 
constexpr size_t kGEMMLOWPCacheLineSize = 64
 
const int kMaxBusyWaitNOPs = 32 * 1000 * 1000
 
where N is the number of elements in the batch, and H and W are the height and width. Each group is of length num_classes, and the softmax is applied to each group independently. See

num_classes: number of classes in each softmax group. Output: tensor of softmax probabilities where softmax was applied to each of the num_anchors groups.
 
See GroupSpatialSoftmax d_scores
 
L2 to L1 transition point Y_hat
 
L2 to L1 transition point tensor of bounding box regression predictions with tensor of labels locations
 
See SelectSmoothL1Loss See SelectSmoothL1Loss d_loss
 
See SelectSmoothL1Loss See SelectSmoothL1Loss Gradient of forward d_Y_hat
 
where indicates that the corresponding sample should be ignored and
 
multiply the loss by this scale factor Tensor of predicted targets
 
where N is the number of elements in the H and W are the height and and each of length num_classes For the binary form of Focal Loss
 
See SigmoidFocalLoss normalizer
 
See SigmoidFocalLoss See SigmoidFocalLoss d_logits
 
NumInputs(4).NumOutputs(1).SetDoc(R"DOC( Smooth L1 Loss is a minor variation of Huber loss in which the point of transition between L2 loss and L1 loss is adjustable by a hyper-parameter beta L2 to L1 transition point Tensor of Tensor of labels with the same shape as Y_hat alpha_out
 
See SmoothL1Loss alpha_in
 
where N is the number of elements in the H and W are the height and and where p_i = exp(s_i) / sum_j exp(s_j)
 
or input of H0
 
See SpatialNarrowAs dC
 
See SpatialNarrowAs Gradient of forward dA
 

Detailed Description

A global dictionary that holds information about what Caffe2 modules have been loaded in the current runtime, and also utility functions to load modules.

Copyright (c) 2016, NVIDIA CORPORATION, All rights reserved.

Copyright (c) 2016-present, Facebook, Inc.

Copyright 2016 Facebook.

Author
Tudor Bosman (tudorb@fb.com)

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

  1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
  2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Function Documentation

CAFFE2_API void caffe2::addBlobDeviceOptions ( std::map< std::string, caffe2::DeviceOption >  blobMap,
nom::repr::NNModule nn 
)

Helpers for the convertToNNModule for use if you already have an NNModule.

You probably don't want to use these if you can use convertToNNModule instead.

Definition at line 16 of file distributed.cc.

CAFFE2_CUDA_API int caffe2::CaffeCudaGetDevice ( )

Gets the current GPU id.

This is a simple wrapper around cudaGetDevice().

Definition at line 96 of file common_gpu.cc.

CAFFE2_CUDA_API void caffe2::CaffeCudaSetDevice ( const int  id)

Sets the current GPU id.

This is a simple wrapper around cudaSetDevice().

Definition at line 102 of file common_gpu.cc.

CAFFE2_API nom::repr::NNModule caffe2::convertToNNModule ( const caffe2::NetDef &  net,
bool  strict,
std::vector< repr::NNGraph::NodeRef > *  opNodeVec 
)

Ingest a caffe2 protobuf model and output an NNModule.

Parameters
netThe caffe2 protobuf NetDef

We keep track of the producer of the blob. Because Caffe2 Nets are really just ordered operations, we can just keep track of the most recent producer of a blob and draw an edge from that to any consumer we come by. If a new operator produces the blob, we simply replace it in this map.

For the construction of the control flow graph we keep track of a current basic block, which we split up as we come across control flow operations such as if and while.

Definition at line 301 of file converter.cc.

CAFFE2_API nom::repr::NNModule caffe2::convertToNNModule ( caffe2::NetDef &  ,
std::map< std::string, caffe2::DeviceOption >   
)

Convert to an NNModule and apply a mapping of tensor names to DeviceOptions to it.

This only applies the map to Declare/Export nodes, which are representationally equivalent to external_input/external_output in caffe2 NetDefs.

Throws an exception if the passed in blobMap contains blobs that are not present in the NNModule.

Definition at line 103 of file distributed.cc.

CAFFE2_API unique_ptr< NetBase > caffe2::CreateNet ( const NetDef &  net_def,
Workspace ws 
)

Creates a network, accessing / creating blobs in the given workspace.

Note that this is different from Workspace::CreateNet. The latter adds the created net object to the workspace's net map, while this function returns a standalone net object.

Definition at line 151 of file net.cc.
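A minimal usage sketch; the NetDef is assumed to have been populated elsewhere (for example, parsed from a serialized protobuf), and error handling is omitted:

#include <memory>
#include "caffe2/core/net.h"
#include "caffe2/core/workspace.h"

void RunStandaloneNet(const caffe2::NetDef& net_def) {
  caffe2::Workspace ws;
  std::unique_ptr<caffe2::NetBase> net = caffe2::CreateNet(net_def, &ws);
  if (net) {
    net->Run();  // standalone net; not registered in the workspace's net map
  }
}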

template<>
std::unique_ptr<RecurrentNetworkExecutorBase> caffe2::createRNNExecutor< CPUContext > ( const NetDef &  step_net_def,
std::map< string, string > &  recurrent_input_map,
std::string  timestep_blob,
ArgumentHelper  rnn_args 
)

Implementation of RecurrentNetworkExecutor that uses thread pool for multithreaded execution of RNNs.

Used with CPU.

Definition at line 13 of file recurrent_network_executor.cc.

template<typename Context >
void caffe2::createSharedBuffer ( Workspace ws)

Creates a mutex and shared buffer in the workspace.

Not thread-safe, must be called from the constructor.

CAFFE2_API const CaffeMap< string, const ModuleSchema * > & caffe2::CurrentModules ( )

Current Modules present in the Caffe2 runtime.

Returns: map: a map of modules and (optionally) their description. The key is the module name, and the value is the description for that module. The module name is recommended to be the part that constitutes the trunk of the dynamic library: for example, a module called libcaffe2_db_rocksdb.so should have the name "caffe2_db_rocksdb". The reason we do not use "lib" is because it's somewhat redundant, and the reason we do not include ".so" is for cross-platform compatibility on platforms like mac os.

Definition at line 27 of file module.cc.
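A small sketch of listing the modules currently present; the printing is illustrative only:

#include <iostream>
#include "caffe2/core/module.h"

void PrintLoadedModules() {
  for (const auto& kv : caffe2::CurrentModules()) {
    std::cout << "loaded caffe2 module: " << kv.first << std::endl;  // key is the module name
  }
}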

CAFFE2_API void caffe2::DeserializeBlob ( const string &  content,
Blob result 
)

Deserializes from a string containing either BlobProto or TensorProto.

If the deserialization fails, the content in the blob should no longer be trusted.

Definition at line 362 of file blob_serialization.cc.
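A minimal usage sketch, assuming `serialized` holds the bytes of a BlobProto or TensorProto (for example, read from a DB):

#include <string>
#include "caffe2/core/blob.h"
#include "caffe2/core/blob_serialization.h"

void RestoreBlob(const std::string& serialized) {
  caffe2::Blob blob;
  caffe2::DeserializeBlob(serialized, &blob);
  // On failure, the content of `blob` should no longer be trusted (see above).
}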

template<typename IndexType , typename InType , typename OutType , bool IS_WEIGHT_POSITIONAL = false>
void caffe2::EmbeddingLookup ( const std::int64_t  block_size,
const std::int64_t  output_size,
const std::int64_t  index_size,
const std::int64_t  data_size,
const InType *  input,
const IndexType *  indices,
const int *  lengths,
const float *  weights,
const float *  scale_bias,
bool  normalize_by_lengths,
OutType *  out 
)

Embedding lookup with reduction.

input of size data_size * block_size; indices of size index_size; lengths of size output_size; weights: nullptr or array of size index_size; out of size output_size * block_size; sum(lengths[i]) == index_size.

Behavior is roughly equivalent to pseudocode:

pos = 0
for (i = 0..index_size-1)
  for (k = 0..block_size-1)
    out[i*block_size + k] = 0
  for (j = 0..lengths[i]-1)
    for (k = 0..block_size-1)
      out[i*block_size + k] += input[indices[pos]*block_size + k] *
          (weights ? weights[IS_WEIGHT_POSITIONAL ? j : pos] : 1.0)
    pos += 1
  if (normalize_weights && lengths[i] > 0)
    for (k = 0..block_size-1)
      out[i*block_size + k] /= lengths[i]
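As a concrete (unoptimized) transcription of the semantics above, a plain C++ reference loop for one float/int64 instantiation might look like the following. This is only a sketch, not the vectorized kernel; the outer loop runs over the output rows, since lengths has one entry per output row, and positional weights are not handled.

#include <cstdint>

void EmbeddingLookupRef(
    std::int64_t block_size,
    std::int64_t output_size,
    const float* input,
    const std::int64_t* indices,
    const int* lengths,
    const float* weights,            // may be nullptr
    bool normalize_by_lengths,
    float* out) {
  std::int64_t pos = 0;
  for (std::int64_t i = 0; i < output_size; ++i) {
    float* dst = out + i * block_size;
    for (std::int64_t k = 0; k < block_size; ++k) dst[k] = 0.0f;
    for (int j = 0; j < lengths[i]; ++j, ++pos) {
      const float w = weights ? weights[pos] : 1.0f;          // non-positional weights
      const float* row = input + indices[pos] * block_size;   // gather one embedding row
      for (std::int64_t k = 0; k < block_size; ++k) dst[k] += w * row[k];
    }
    if (normalize_by_lengths && lengths[i] > 0) {
      for (std::int64_t k = 0; k < block_size; ++k) dst[k] /= lengths[i];
    }
  }
}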

fbgemm::CompressedSparseColumn* caffe2::ExtractOutlierMatrix ( int  groups,
int  kernel_dim,
int  M,
int  nbits_in_non_outlier,
vector< std::int8_t > &  W_quantized 
)
Parameters
W_quantizedinput quantized weight that is not packed yet
template<typename IndexType , typename InType , typename OutType , bool IS_WEIGHT_POSITIONAL = false>
void caffe2::Fused8BitRowwiseEmbeddingLookup ( const std::int64_t  block_size,
const std::int64_t  output_size,
const std::int64_t  index_size,
const std::int64_t  data_size,
const InType *  input,
const IndexType *  indices,
const int *  lengths,
const float *  weights,
bool  normalize_by_lengths,
OutType *  out 
)

Embedding lookup with reduction.

input of size data_size * (block_size + 8B); indices of size index_size; lengths of size output_size; weights: nullptr or array of size index_size; out of size output_size * block_size; sum(lengths[i]) == index_size.

Note that block_size should be the number of quantized values per row in the data, i.e. excluding the scale and bias. The total (fused) block size is assumed to be this block_size, plus 4 bytes for scale and 4 bytes for bias.

Behavior is roughly equivalent to pseudocode:

pos = 0
fused_block_size = block_size + 8B  // quantized values plus scale and bias
for (i = 0..index_size-1)
  for (k = 0..block_size-1)
    out[i*block_size + k] = 0
  for (j = 0..lengths[i]-1)
    for (k = 0..block_size-1)
      out[i*block_size + k] += input[indices[pos]*(fused_block_size) + k] *
          (weights ? weights[IS_WEIGHT_POSITIONAL ? j : pos] : 1.0)
    pos += 1
  if (normalize_weights && lengths[i] > 0)
    for (k = 0..block_size-1)
      out[i*block_size + k] /= lengths[i]

void caffe2::Get1DPartitionOf2D ( int  m,
int  n,
int  nthreads,
int  thread_id,
int *  m_begin,
int *  m_end,
int *  n_begin,
int *  n_end,
int  n_align = 1 
)

1D-partition m x n 2D work.

First try partitioning m if m >= nthreads. Otherwise, each row is partitioned by multiple threads; in this case, each thread only works on a single row. Optionally, we can force the number of columns assigned per thread to be a multiple of n_align.

Definition at line 20 of file dnnlowp_partition.cc.
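A usage sketch: each worker asks for its slice of an m x n grid of work. The sizes and thread count are illustrative, and the header path is assumed from the definition file mentioned above.

#include "caffe2/quantization/server/dnnlowp_partition.h"  // header path assumed from dnnlowp_partition.cc

void PartitionExample() {
  const int m = 1024, n = 512, nthreads = 8;
  for (int tid = 0; tid < nthreads; ++tid) {
    int m_begin, m_end, n_begin, n_end;
    caffe2::Get1DPartitionOf2D(
        m, n, nthreads, tid, &m_begin, &m_end, &n_begin, &n_end);
    // This thread would process rows [m_begin, m_end) and columns [n_begin, n_end).
  }
}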

CAFFE2_CUDA_API CudaMemoryPoolType caffe2::GetCudaMemoryPoolType ( )

Gets the current memory pool type used by Caffe2.

The memory pool is set up during caffe2's global initialization time.

CAFFE2_CUDA_API bool caffe2::GetCudaPeerAccessPattern ( vector< vector< bool >> *  pattern)

Return a peer access pattern by returning a matrix (in the format of a nested vector) of boolean values specifying whether peer access is possible.

This function returns false if anything wrong happens during the query of the GPU access pattern.

CAFFE2_CUDA_API const cudaDeviceProp & caffe2::GetDeviceProperty ( const int  device)

Gets the device property for the given device.

This function is thread safe.

Definition at line 139 of file common_gpu.cc.

template<typename ACC_T >
std::shared_ptr<fbgemm::PackBMatrix<int8_t, ACC_T> > caffe2::GetOrCreateFbgemmPackBMatrix ( fbgemm::matrix_op_t  trans,
std::int32_t  m,
std::int32_t  n,
const void *  orig_data,
const std::int8_t *  quantized_data,
std::int32_t  ld 
)

If there's an existing packed matrix for the same matrix, reuse it.

Create a new one otherwise. This can save memory usage if many threads are sharing the same weight.

CAFFE2_API bool caffe2::GlobalInit ( int *  pargc,
char ***  argv 
)

Initialize the global environment of caffe2.

Caffe2 uses a registration pattern for initialization functions. Custom initialization functions should take the signature bool (*func)(int*, char***) where the pointers to argc and argv are passed in. Caffe2 then runs the initialization in three phases: (1) Functions registered with REGISTER_CAFFE2_EARLY_INIT_FUNCTION. Note that since it is possible the logger is not initialized yet, any logging in such early init functions may not be printed correctly. (2) Parses Caffe-specific commandline flags, and initializes caffe logging. (3) Functions registered with REGISTER_CAFFE2_INIT_FUNCTION. If there is something wrong at any stage, the function returns false. If the global initialization has already been run, the function returns false as well.

GlobalInit is re-entrant safe; a re-entrant call will no-op and exit.

GlobalInit is safe to call multiple times but not idempotent; successive calls will parse flags and re-set caffe2 logging levels from flags as needed, but NOT re-run early init and init functions.

GlobalInit is also thread-safe and can be called concurrently.

Definition at line 44 of file init.cc.
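A minimal usage sketch following the registration pattern described above. The custom init function, its name, and its log message are illustrative only; the macro and header usage are assumptions based on the documentation text.

#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"

// Hypothetical custom init function; returning false aborts GlobalInit.
bool MyCustomInit(int* /*pargc*/, char*** /*pargv*/) {
  LOG(INFO) << "Running custom caffe2 init.";
  return true;
}
REGISTER_CAFFE2_INIT_FUNCTION(
    MyCustomInit, &MyCustomInit, "An illustrative custom init function");

int main(int argc, char** argv) {
  if (!caffe2::GlobalInit(&argc, &argv)) {
    return 1;  // initialization failed
  }
  // ... build workspaces, run nets, etc. ...
  return 0;
}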

CAFFE2_API bool caffe2::GlobalInit ( )

Initialize the global environment without command line arguments.

This is a version of GlobalInit where no arguments are passed in. Use this version on mobile devices, where command-line options cannot be passed to caffe2.

Definition at line 93 of file init.cc.

CAFFE2_API MPI_Comm caffe2::GlobalMPIComm ( )

Gets the global MPI communicator used by Caffe2.

By default, this is MPI_COMM_WORLD unless you call SetGlobalMPIComm().

Definition at line 20 of file mpi_common.cc.

bool caffe2::HasCudaGPU ( )
inline

Check if the current running session has a cuda gpu present.

Note that this is different from having caffe2 built with cuda. Building Caffe2 with cuda only guarantees that this function exists. If there are no cuda gpus present in the machine, or there are hardware configuration problems like an insufficient driver, this function will still return false, meaning that there is no usable GPU present.

In the open source build, Caffe2's GPU code may be dynamically loaded, so a library could be linked only against the CPU code but still want to test whether cuda is available at runtime. In that case, use HasCudaRuntime() from common.h.

Definition at line 149 of file common_gpu.h.
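A small usage sketch based on the behavior described above; the include path is assumed from the common_gpu.h reference.

#include <iostream>
#include "caffe2/core/common_gpu.h"  // assumed header for the declaration

// Decide at runtime whether the GPU path can be used at all.
int main() {
  if (caffe2::HasCudaGPU()) {
    std::cout << "Found a usable CUDA GPU; the GPU path can be used." << std::endl;
  } else {
    std::cout << "No usable CUDA GPU; falling back to the CPU path." << std::endl;
  }
  return 0;
}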

CAFFE2_API void caffe2::LoadModule ( const string &  name,
const string &  filename = "" 
)

Load a module.

Inputs:
name: a module name or a path name. It is recommended that you use the name of the module and reserve the full-path option for experimental modules only.
filename: (optional) a filename that serves as a hint to load the module.

Definition at line 52 of file module.cc.
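A usage sketch for the two calling styles described above. The module name "caffe2_rocksdb" and the library path are illustrative placeholders, and the include path is assumed from the module.cc reference.

#include "caffe2/core/module.h"  // assumed header for the declaration

void LoadExtraModules() {
  // Load by module name (recommended).
  caffe2::LoadModule("caffe2_rocksdb");
  // Or give an explicit shared-library path as a hint (experimental modules).
  caffe2::LoadModule("caffe2_rocksdb", "/path/to/libcaffe2_rocksdb.so");
}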

template<typename F >
detail::ScopeGuardImplDecay<F> caffe2::MakeGuard ( F &&  f)
noexcept

ScopeGuard is a general implementation of the "Resource Acquisition Is Initialization" (RAII) idiom.

Basically, it guarantees that a function is executed upon leaving the current scope unless otherwise told.

The MakeGuard() function is used to create a new ScopeGuard object. It can be instantiated with a lambda function, a std::function<void()>, a functor, or a void(*)() function pointer.

Usage example: Add a friend to memory iff it is also added to the db.

void User::addFriend(User& newFriend) {
  // Add the friend to memory.
  friends_.push_back(&newFriend);

  // If the db insertion that follows fails, we should remove it from memory.
  auto guard = MakeGuard([&] { friends_.pop_back(); });

  // This will throw an exception upon error, which makes the ScopeGuard
  // execute the cleanup (friends_.pop_back()) once the guard's destructor
  // is called.
  db_->addFriend(GetName(), newFriend.GetName());

  // An exception was not thrown, so dismiss the guard and keep the friend.
  guard.dismiss();
}

Examine ScopeGuardTest.cpp for some more sample usage.

Stolen from: Andrei's and Petru Marginean's CUJ article: http://drdobbs.com/184403758 and the loki library: http://loki-lib.sourceforge.net/index.php?n=Idioms.ScopeGuardPointer and triendl.kj article: http://www.codeproject.com/KB/cpp/scope_guard.aspx

Definition at line 153 of file scope_guard.h.

CAFFE2_API bool caffe2::MatchStrings ( string  p,
string  s 
)

This allows for the use of * and | to match operator types, engines, or any other property that is represented by a string.

For example, if we wanted to match an operator to Conv or FC, we can give: "Conv|FC" as the type() of that op.

Definition at line 214 of file graph.cc.
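A usage sketch based on the example given above; the include path is an assumption derived from the graph.cc reference.

#include <string>
#include "caffe2/core/graph.h"  // assumed header for the declaration

// Match an operator type against a pattern: "Conv|FC" matches either type,
// and "*" would match any type.
bool IsConvOrFC(const std::string& op_type) {
  return caffe2::MatchStrings("Conv|FC", op_type);
}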

void caffe2::MPISetupPeers ( const int  replicas,
const string &  role,
const string &  job_path 
)

A function used to perform peer setup so one does not need to use mpirun / mpiexec to run the binary.

Note that if you use mpirun or mpiexec to set up the common world, do not use this function - MPI_Init would have already set that up.

This also assumes that you have a common path (like NFS) that multiple instances can read from.

Inputs:
replicas (int): the number of replicas that MPI will run with.
role (string): the role of this process, "server" or "client".
job_path (string): a file name that the server will write its port into and that the clients will read the server's port from.

Definition at line 94 of file mpi_common.cc.

CAFFE2_API void caffe2::SerializeBlob ( const Blob &  blob,
const string &  name,
BlobSerializerBase::SerializationAcceptor  acceptor,
int  chunk_size = kDefaultChunkSize 
)

Serializes the given blob, if possible.

Note that this serialization uses the registration mechanism and one has to implement specific serialization approaches for specific classes. Acceptor should take care of writing data to the actual storage.

Definition at line 92 of file blob_serialization.cc.

CAFFE2_API string caffe2::SerializeBlob ( const Blob &  blob,
const string &  name 
)

Convenience function to serialize a blob to a string.

This is a convenience function to serialize small Blobs that produce manageable serialized strings. To serialize big blobs such as large sparse tensors, use the fully-functional interface in blob_serializer_base.h.

NOTE: this function doesn't do chunking and might break with big tensors.

Definition at line 100 of file blob_serialization.cc.
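A round-trip sketch for the string-based convenience overload, suitable only for small blobs as noted above. The include paths and the BlobGetMutableTensor helper are assumptions about the surrounding core API, not part of this function's documentation.

#include <string>
#include "caffe2/core/blob.h"                // assumed header
#include "caffe2/core/blob_serialization.h"  // assumed header
#include "caffe2/core/tensor.h"              // assumed header

// Serialize a small CPU tensor blob to a string.
std::string SerializeSmallTensorBlob() {
  caffe2::Blob blob;
  caffe2::Tensor* t = caffe2::BlobGetMutableTensor(&blob, caffe2::CPU);
  t->Resize(2, 3);
  t->mutable_data<float>();  // allocate storage (contents left uninitialized)
  return caffe2::SerializeBlob(blob, "my_blob");
}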

CAFFE2_API void caffe2::SetGlobalMPIComm ( MPI_Comm  new_comm)

Sets the global MPI communicator.

Caffe2 takes over the ownership of the passed in communicator.

Definition at line 24 of file mpi_common.cc.

template<typename SIndex >
int caffe2::sparse_adagrad ( int  num_rows,
int  block_size,
std::uint64_t  param_size,
const float *  w,
const float *  g,
const float *  h,
const SIndex *  indices,
float *  nw,
float *  nh,
float  epsilon,
float  lr 
)
Returns
num_rows if the update succeeds; otherwise, the index of the row at which the update steps past the boundary of param_size.
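
A usage sketch that checks the documented return value. All sizes and hyperparameter values are illustrative, the update is done in place (nw/nh aliasing w/h) purely for brevity, and the include path for the declaration is an assumption.

#include <cstdint>
#include <vector>
#include "caffe2/core/logging.h"
#include "caffe2/perfkernels/adagrad.h"  // assumed header for the declaration

void SparseAdagradSketch(
    const std::vector<std::int64_t>& indices,  // rows to update
    const std::vector<float>& grad,            // num_rows * block_size values
    std::vector<float>& param,                 // full parameter table
    std::vector<float>& moment) {              // full moment table
  const int block_size = 4;  // illustrative row width
  const int num_rows = static_cast<int>(indices.size());
  int ret = caffe2::sparse_adagrad(
      num_rows,
      block_size,
      param.size(),    // param_size, used for bounds checking
      param.data(),    // w
      grad.data(),     // g
      moment.data(),   // h
      indices.data(),  // indices (SIndex = int64_t)
      param.data(),    // nw (updated in place here)
      moment.data(),   // nh (updated in place here)
      1e-5f,           // epsilon
      0.01f);          // lr
  CAFFE_ENFORCE_EQ(
      ret, num_rows, "Row ", ret, " steps past the boundary of param_size.");
}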
template<typename T >
Tensor caffe2::TensorCPUFromValues ( at::IntArrayRef  dims,
at::ArrayRef< T >  values 
)

Creates a CPU tensor, and fills its contents with the given values.

Values are copied in

Definition at line 663 of file tensor.h.
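A short usage sketch based on the signature above; the include path is an assumption derived from the tensor.h reference.

#include "caffe2/core/tensor.h"  // assumed header for the declaration

// Build a 2x3 float CPU tensor from literal values.
void MakeSmallTensor() {
  caffe2::Tensor t = caffe2::TensorCPUFromValues<float>(
      {2, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
  // t now owns a copy of the six values, laid out in row-major order.
  const float* data = t.data<float>();
  (void)data;
}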

Variable Documentation

Fragment of the Softmax operator documentation: the input does not need to be 2-dimensional explicitly; it will be coerced into one. For an arbitrary n-dimensional tensor X with shape [a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}] and axis k, X will be coerced into a 2-dimensional tensor with dimensions [a_0 * ... * a_{k-1}, a_k * ... * a_{n-1}]. For the default case, X is coerced into a 2D tensor of shape [a_0, a_1 * ... * a_{n-1}], where a_0 is often the batch size; in this case we must have a_0 = N and a_1 * ... * a_{n-1} = D, and each of these dimensions must be matched correctly.

Definition at line 123 of file softmax_op.cc.

Definition at line 26 of file int8_softmax_op.cc.

Fragment of the SigmoidCrossEntropyLoss operator documentation: a target of -1 indicates that the corresponding sample should be ignored, while {0, 1} correspond to the binary classes 0 and 1. By default the loss is divided by the number of targets > -1 and then multiplied by the `scale` op argument (float, default 1.0). The divisive normalization may be disabled by setting the op argument `normalize` to 0 (the multiplication by `scale` still takes effect). This op fuses sigmoid and cross entropy for numerical stability in both the forward and gradient computation.

Definition at line 34 of file sigmoid_cross_entropy_loss_op.cc.

auto caffe2::batch_size
Initial value:
=
size_to_dim_(canonical_axis, GetDimsVector(logits))

Definition at line 26 of file softmax_with_loss_op.cc.

Fragment of a counter operator's documentation: resets a count-down counter with the initial value specified by the `init_count` argument (*(type: int; default: 0)*). Input 0, `counter` (*(type: Tensor`<ptr>`)*), is a blob pointing to an instance of a counter; optional output 0, `previous_value` (*(type: int)*), is the count value BEFORE this operation. The operator takes one input and zero or one outputs.

Definition at line 210 of file counter_ops.cc.

Fragment of the SpaceToBatch operator documentation: this op outputs a copy of the input tensor in which values from the height and width dimensions are moved to the batch dimension. After the zero padding is applied according to `pad`, both the height and the width of the input must be divisible by `block_size`. Only NCHW order is currently supported. Arguments: `block_size` (*int*, default 2): height/width of the spatial blocks to be moved; `pad` (*int*, default 0); `order` (*string*): order of dimensions of the input and output blobs. Example from the docs: with block_size = 3, an input X of shape (1, 3, 5, 5) produces an output Y of shape (9, 3, 3, 3).

Definition at line 27 of file space_batch_op.cc.

const auto caffe2::canonical_axis
Initial value:
=
canonical_axis_index_(axis, in[0].dims().size())

Definition at line 147 of file layer_norm_op.cc.

constexpr DeviceType caffe2::COMPILE_TIME_MAX_DEVICE_TYPES
Initial value:
=
DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES

Definition at line 16 of file caffe2_pb.h.

Fragment of an ONNX-related operator's documentation: a string argument (default "") holds the serialized ONNX model to be converted to a backend representation, and the `initializers` argument is the initialization pair indicating the mapping of names between the NetDef and the ONNX model.

Definition at line 103 of file arg_ops.cc.

auto caffe2::dilation_h
Initial value:
= helper.GetSingleArgument<int>(
"dilation_h", helper.GetSingleArgument<int>("dilation", 1))

Definition at line 40 of file im2col_op.cc.

auto caffe2::dilation_w
Initial value:
= helper.GetSingleArgument<int>(
"dilation_w", helper.GetSingleArgument<int>("dilation", 1))

Definition at line 42 of file im2col_op.cc.

Fragment of the Reshape operator implementation: in the else branch, the code enforces that the total size implied by the `shape` argument agrees with the input data:
CAFFE_ENFORCE_EQ(totalSize, size, "Argument `shape` does not agree with the input data.", " (", totalSize, " != ", size, ")")

Definition at line 88 of file reshape_op.cc.

const char* caffe2::githubLinks
Initial value:
= R"DOC(
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/counter_ops.cc
)DOC"

Definition at line 6 of file counter_ops.cc.

Fragment of the BatchGather shape-inference code (batch_gather_ops.cc): `in` is the vector of input TensorShapes, and the function begins by declaring vector<TensorShape> out(1).

Definition at line 12 of file batch_gather_ops.cc.

const char caffe2::kAveragePoolDoc_int8[]
Initial value:
= R"DOC(
consumes an input blob X and applies average pooling across the
the blob according to kernel sizes, stride sizes, and pad lengths defined by the
ConvPoolOpBase operator. Average pooling consisting of averaging all values of a
subset of the input tensor according to the kernel size and downsampling the
data into the output blob Y for further processing.
)DOC"

Definition at line 12 of file int8_average_pool_op.cc.

const char caffe2::kConvDoc_int8[]
Initial value:
= R"DOC(
[Only NHWC order is supported now]Note that other parameters, such as the stride and
kernel size, or the pads' sizes in each direction are not necessary for input
because they are provided by the ConvPoolOpBase operator. Various dimension
checks are done implicitly, and the sizes are specified in the Input docs for
this operator. As is expected, the filter is convolved with a subset of the
image and the bias is added; this is done throughout the image data and the
output is computed. As a side note on the implementation layout:
conv_op_impl.h is the templated implementation of the conv_op.h file, which is
why they are separate files.
)DOC"

Definition at line 7 of file int8_conv_op.cc.

const char* caffe2::kConvFusionDoc
Initial value:
= R"DOC(
Note that other parameters, such as the stride and
kernel size, or the pads' sizes in each direction are not necessary for input
because they are provided by the ConvPoolOpBase operator. Various dimension
checks are done implicitly, and the sizes are specified in the Input docs for
this operator. As is expected, the filter is convolved with a subset of the
image and the bias is added; this is done throughout the image data and the
output is computed. As a side note on the implementation layout:
conv_op_impl.h is the templated implementation of the conv_op.h file, which is
why they are separate files.
)DOC"

Definition at line 151 of file conv_fusion_op.cc.

auto caffe2::kernel_h
Initial value:
= helper.GetSingleArgument<int>(
"kernel_h", helper.GetSingleArgument<int>("kernel", 0))

Definition at line 36 of file im2col_op.cc.

auto caffe2::kernel_w
Initial value:
= helper.GetSingleArgument<int>(
"kernel_w", helper.GetSingleArgument<int>("kernel", 0))

Definition at line 38 of file im2col_op.cc.

const char caffe2::kMaxPoolDoc_int8[]
Initial value:
= R"DOC(
consumes an input blob X and applies max pooling across the
the blob according to kernel sizes, stride sizes, and pad lengths defined by the
ConvPoolOpBase operator. Max pooling consisting of taking the maximum value of a
subset of the input tensor according to the kernel size and downsampling the
data into the output blob Y for further processing.
)DOC"

Definition at line 10 of file int8_max_pool_op.cc.

Fragment of operator documentation (lengths_pad_op.cc): segments are defined by their LENGTHS and concatenated in an output tensor; short segments have their output values (and the corresponding output indices) padded. Example from the docs: LENGTHS = [0, 1, 1, 1] and target_length = 2.

Definition at line 20 of file lengths_pad_op.cc.

given a parameter tensor X and its gradient dX, the local learning rate for X will be caffe2::local_lr
Initial value:
= trust * norm(X) / ( norm(dX) + wd * norm(X) + offset * norm(X) )
= trust / ( norm(dX) / norm(X) + wd + offset )

Definition at line 33 of file lars_op.cc.
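The formula above reads directly as code. A minimal sketch follows, assuming plain L2 norms; trust, wd (weight decay), and offset are the names from the formula, and the guard against a zero denominator is an added assumption, not taken from lars_op.cc.

#include <cmath>
#include <vector>

// Compute the LARS-style local learning rate from the formula above.
float LocalLearningRateSketch(
    const std::vector<float>& X,   // parameter tensor
    const std::vector<float>& dX,  // its gradient
    float trust, float wd, float offset) {
  auto l2_norm = [](const std::vector<float>& v) {
    double s = 0.0;
    for (float x : v) s += static_cast<double>(x) * x;
    return static_cast<float>(std::sqrt(s));
  };
  const float norm_x = l2_norm(X);
  const float norm_dx = l2_norm(dX);
  const float denom = norm_dx + wd * norm_x + offset * norm_x;
  if (denom == 0.0f) {
    return trust;  // illustrative guard; not part of the documented formula
  }
  // local_lr = trust * norm(X) / (norm(dX) + wd * norm(X) + offset * norm(X))
  return trust * norm_x / denom;
}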

constexpr int caffe2::MaxDeviceTypes
Initial value:
=
DeviceTypeProto::PROTO_COMPILE_TIME_MAX_DEVICE_TYPES

Definition at line 13 of file event.h.

std::vector<int> caffe2::newDims
Initial value:
=
SqueezeOp<CPUContext>::ComputeDims(GetDimsVector(in[0]), dims)

Definition at line 125 of file expand_squeeze_dims_op.cc.

auto caffe2::num_classes
Initial value:
=
size_from_dim_(canonical_axis, GetDimsVector(logits))

Definition at line 28 of file softmax_with_loss_op.cc.

Fragment of the BisectPercentile operator documentation: num_feature = F (F >= 1) is the number of features.

Definition at line 14 of file bisect_percentile_op.cc.

auto caffe2::order
Initial value:
= StringToStorageOrder(
helper.GetSingleArgument<string>("order", "NCHW"))

Definition at line 17 of file do_op.cc.

std::vector< int64_t > caffe2::output_dims
Initial value:
=
caffe2::gather_helper::calc_output_shape_vector<int>(
data_dims, indices_dims, 1)

Definition at line 18 of file batch_gather_ops.cc.

Fragment of the BatchGather operator documentation: the data input is a tensor of rank r, and input 1, INDICES, is a tensor of int32/int64 indices.

Definition at line 25 of file batch_gather_ops.cc.

Fragment of an Int8 operator's documentation: output 0, `output`, is "A 2D Int8 tensor with the contents of the input tensor".

Definition at line 10 of file lengths_pad_op.cc.

auto caffe2::stride_h
Initial value:
= helper.GetSingleArgument<int>(
"stride_h", helper.GetSingleArgument<int>("stride", 1))

Definition at line 44 of file im2col_op.cc.

auto caffe2::stride_w
Initial value:
= helper.GetSingleArgument<int>(
"stride_w", helper.GetSingleArgument<int>("stride", 1))

Definition at line 46 of file im2col_op.cc.

int caffe2::tiles
Initial value:
=
helper.GetSingleArgument<std::int32_t>("tiles", 1)

Definition at line 94 of file tile_op.cc.

auto caffe2::valid_axes
Initial value:
=
std::all_of(axes.begin(), axes.end(), [&tensor_size](int& axis) {
return axis >= 0 && axis < tensor_size;
})

Definition at line 24 of file transpose_op.cc.

Fragment of the BooleanUnmask operator documentation: the op reconstructs values together according to masks. For every position, at least one mask must be True; if more than one mask is True, the first value is accepted and no further value is expected for that position. Example values from the docs: values2 = 2.0, mask3 = False.

Definition at line 67 of file boolean_unmask_ops.cc.

return caffe2::vector< TensorShape >
Initial value:
{
CreateTensorShape(dst_sizes, data.data_type())}

Definition at line 109 of file slice_op.cc.

Fragment of the MarginRankingCriterion operator documentation: if Y == 1, it is assumed that the first input should be ranked higher (have a larger value) than the second input.

Definition at line 79 of file margin_ranking_criterion_op.cc.

Fragment of operator documentation: input 0, `X` (*(type: Tensor`<float>`)*), is the input data tensor; output 0, `Y` (*(type: Tensor`<float>`)*), is the output tensor; output 1, `mask` (*(type: Tensor`<bool>`)*), is an output mask containing a boolean value for each element.

Definition at line 119 of file cast_op.cc.