Скачиваний:
28
Добавлен:
01.05.2014
Размер:
19.55 Кб
Скачать
unit Discretize;

interface
uses
Range,
dmmTypes,
instances,
Exceptions,
dmmConstants,
SysUtils,
FastVector,
stringObject,
attribute,
Contnrs,
Filter,
dmm;
type
// Filter that converts numeric attributes into nominal ones by splitting
// their value range into intervals (bins). Cut points are either computed
// (equal-width, equal-frequency or automatic bin-count search) or taken
// from m_ManualCutPoints, depending on m_pointsCalculated (see batchFinished).
TDMDiscretize = class(TDMFilter)
public
// columns to discretize
m_DiscretizeCols : TDMRange;

// number of intervals (bins)
m_NumBins : integer;

// weight of instances per interval (values <= 0 mean "not set")
m_DesiredWeightOfInstancesPerInterval : double;

// cut points (interval boundaries), one array per attribute
m_CutPoints : D2DArray;

// cut points for manual partitioning
m_ManualCutPoints : D2DArray;

// search for the optimal number of intervals
m_FindNumBins : boolean;

// use equal-frequency (uniform) binning
m_UseEqualFrequency : boolean;

// default columns to discretize
m_DefaultCols : AnsiString;

// when false, batchFinished computes the cut points; when true, it copies
// them from m_ManualCutPoints instead
m_pointsCalculated : boolean;
protected

public
// Creates the filter for all columns ('first-last').
constructor Create();overload; override;
// Creates the filter for the column range given in cols.
constructor Create(cols : AnsiString );overload;
// Sets the range of columns to discretize.
procedure setAttributeIndices(rangeList : AnsiString );
// Resets all options to their defaults and creates m_DiscretizeCols.
procedure DefaultSettings();
// Registers the input format; always returns false.
function setInputFormat(instanceInfo : TDMInstances ) : boolean ; override;
function getFindNumBins() : boolean;
function getUseEqualFrequency() : boolean;
// Feeds one instance; returns true when it was converted immediately.
function input( instance : TDMInstance) : boolean;
// Ends the batch; returns true when converted instances are pending.
function batchFinished() : boolean;
function getDesiredWeightOfInstancesPerInterval() : double;
function getBins() : integer;
function getAttributeIndices() : AnsiString;
// Runs the whole data set through the filter and returns the result.
function useFilter(data : TDMInstances) : TDMInstances; override;
procedure setFindNumBins(newFindNumBins : boolean); override;
protected
// Computes cut points for every eligible attribute.
procedure calculateCutPoints();
// Chooses the number of equal-width bins for one attribute.
procedure findNumBins(index : integer);
procedure calculateCutPointsByEqualWidthBinning(index : integer);
procedure calculateCutPointsByEqualFrequencyBinning(index : integer);
// Builds the output header from the computed cut points.
procedure setOutputFormat();
// Converts one instance and pushes it to the output queue.
procedure convertInstance(instance : TDMInstance );
// Replaces the stored output format and resets the output queue.
procedure setFilterOutputFormat(outputFormat : TDMInstances );
public
procedure ClearQueue(); overload;
destructor Destroy; override;
end;


implementation
{ Default constructor: applies the default settings and selects every
  column ('first-last') for discretization. }
constructor TDMDiscretize.Create();
const
  ALL_COLUMNS = 'first-last';
begin
  inherited Create;
  DefaultSettings();
  m_DefaultCols := ALL_COLUMNS;
  setAttributeIndices(ALL_COLUMNS);
end;

{ Constructor taking the column range to discretize.
  cols - range list passed on to setAttributeIndices and remembered
         as the default column selection. }
constructor TDMDiscretize.Create(cols : AnsiString );
begin
  inherited Create();
  DefaultSettings();            // also creates m_DiscretizeCols
  setAttributeIndices(cols);
  m_DefaultCols := cols;
end;

{ Sets the columns to discretize.
  rangeList - range specification forwarded to m_DiscretizeCols.setRanges. }
procedure TDMDiscretize.setAttributeIndices(rangeList : AnsiString );
begin
  Self.m_DiscretizeCols.setRanges(rangeList);
end;

{ Resets every option to its default value: 10 bins, no desired weight
  per interval (-1), no cut points, all mode flags off.
  A new TDMRange is created for m_DiscretizeCols on each call. }
procedure TDMDiscretize.DefaultSettings();
begin
  // mode flags
  m_FindNumBins := false;
  m_UseEqualFrequency := false;
  m_pointsCalculated := false;

  // binning parameters
  m_NumBins := 10;
  m_DesiredWeightOfInstancesPerInterval := -1;

  // state
  m_CutPoints := nil;
  m_DiscretizeCols := TDMRange.Create();
end;

{ Returns the m_FindNumBins flag. }
function TDMDiscretize.getFindNumBins() : boolean;
begin
  Result := Self.m_FindNumBins;
end;

{ Returns the m_UseEqualFrequency flag. }
function TDMDiscretize.getUseEqualFrequency() : boolean;
begin
  Result := Self.m_UseEqualFrequency;
end;

{ Returns the desired weight of instances per interval
  (m_DesiredWeightOfInstancesPerInterval). }
function TDMDiscretize.getDesiredWeightOfInstancesPerInterval() : double;
begin
  Result := Self.m_DesiredWeightOfInstancesPerInterval;
end;

{ Registers the format of the incoming data.
  instanceInfo - header describing the input instances.
  Returns false in all cases.
  Raises EIllegalArgumentException when both the bin-count search and
  equal-frequency binning are enabled. }
function TDMDiscretize.setInputFormat(instanceInfo : TDMInstances ) : boolean ;
var
  conflictingModes : boolean;
begin
  inherited setInputFormat(instanceInfo);

  // The column range needs the highest valid attribute index.
  m_DiscretizeCols.setUpper(instanceInfo.numAttributes() - 1);
  // Any previously computed cut points are discarded.
  m_CutPoints := nil;

  conflictingModes := getFindNumBins() and getUseEqualFrequency();
  if conflictingModes then
    raise EIllegalArgumentException.Create('Не допускается одновременно задавать равномерное разбиение и автоматический поиск числа интервалов');

  Result := false;
end;

{ Feeds one instance into the filter.
  instance - the instance to process.
  Returns true when cut points already exist and the instance was
  converted immediately; false when it was only buffered.
  Raises EIllegalStateException when no input format has been set. }
function TDMDiscretize.input( instance : TDMInstance) : boolean;
begin
  if getInputFormat() = nil then
    raise EIllegalStateException.Create('Не определен формат входной выборки');

  // The first instance of a new batch resets the queue.
  if m_NewBatch then
  begin
    resetQueue();
    m_NewBatch := false;
  end;

  Result := (m_CutPoints <> nil);
  if Result then
    convertInstance(instance)
  else
    bufferInput(instance);
end;

{ Chooses the number of equal-width bins (1..m_NumBins) for one attribute
  by minimising the entropy-based score computed below, then stores the
  resulting cut points in m_CutPoints[index].
  index - attribute index; m_CutPoints must already be sized to hold it.

  Fixes relative to the previous version:
    * the running maximum is seeded from the first non-missing value
      instead of -MIN_VALUE, which gave a wrong maximum;
    * cut points are built from the width of the CHOSEN bin count, not
      from the width left over from the last trial of the search loop. }
procedure TDMDiscretize.findNumBins(index : integer);
const
  EPS = 0.0000001;   // tolerance for values lying on a bin's upper edge
var
  minVal, maxVal, binWidth, entropy, bestEntropy, currentVal : double;
  upperEdge : double;
  distribution : DArray;
  bestNumBins : integer;
  currentInstance : TDMInstance;
  hasValues : boolean;
  i, j, k : integer;
begin
  minVal := 0;
  maxVal := 0;
  hasValues := false;
  bestEntropy := MAX_VALUE;
  bestNumBins := 1;

  // Find the minimum and maximum over the non-missing values.
  for j := 0 to getInputFormat().numInstances() - 1 do
  begin
    currentInstance := getInputFormat().instance(j);
    if currentInstance.isMissing(index) then
      Continue;
    currentVal := currentInstance.value(index);
    if not hasValues then
    begin
      minVal := currentVal;
      maxVal := currentVal;
      hasValues := true;
    end
    else
    begin
      if currentVal > maxVal then
        maxVal := currentVal;
      if currentVal < minVal then
        minVal := currentVal;
    end;
  end;

  // Nothing to bin: the attribute gets no cut points.
  if not hasValues then
  begin
    SetLength(m_CutPoints[index], 0);
    exit;
  end;

  // Try every bin count from 1 to m_NumBins.
  for i := 0 to m_NumBins - 1 do
  begin
    SetLength(distribution, i + 1);
    for k := 0 to i do
      distribution[k] := 0;
    binWidth := (maxVal - minVal) / (i + 1);

    // Accumulate instance weights per bin.
    for j := 0 to getInputFormat().numInstances() - 1 do
    begin
      currentInstance := getInputFormat().instance(j);
      if currentInstance.isMissing(index) then
        Continue;
      currentVal := currentInstance.value(index);
      for k := 0 to i do
      begin
        upperEdge := minVal + ((k + 1) * binWidth);
        if (currentVal <= upperEdge) or (abs(currentVal - upperEdge) < EPS) then
        begin
          distribution[k] := distribution[k] + currentInstance.weight();
          Break;
        end;
      end;
    end;

    // Score this bin count; any bin with weight < 2 disqualifies it.
    entropy := 0;
    for k := 0 to i do
    begin
      if distribution[k] < 2 then
      begin
        entropy := MAX_VALUE;
        Break;
      end;
      if binWidth = 0 then
        entropy := IMIN_VALUE
      else
        entropy := entropy - distribution[k] * ln((distribution[k] - 1) / binWidth);
    end;

    if entropy < bestEntropy then
    begin
      bestEntropy := entropy;
      bestNumBins := i + 1;
    end;
  end;

  // Build the cut points using the width that belongs to the best bin count.
  binWidth := (maxVal - minVal) / bestNumBins;
  if (bestNumBins > 1) and (binWidth > 0) then
  begin
    SetLength(m_CutPoints[index], bestNumBins - 1);
    for i := 1 to bestNumBins - 1 do
      m_CutPoints[index, i - 1] := minVal + binWidth * i;
  end
  else
    SetLength(m_CutPoints[index], 0);

  SetLength(distribution, 0);
end;

{ Splits the value range of one attribute into m_NumBins intervals of
  equal width and stores the m_NumBins - 1 boundaries in m_CutPoints[index].
  index - attribute index; m_CutPoints must already be sized to hold it.

  No cut points are produced (empty array) when the attribute has no
  non-missing values, when all values are equal, or when m_NumBins < 2.
  The m_NumBins < 2 check also prevents the division by zero the previous
  version hit for m_NumBins = 0. }
procedure TDMDiscretize.calculateCutPointsByEqualWidthBinning(index : integer);
var
  minVal, maxVal, currentVal, binWidth : double;
  currentInstance : TDMInstance;
  hasValues : boolean;
  i : integer;
begin
  minVal := 0;
  maxVal := 0;
  hasValues := false;

  // Find the minimum and maximum over the non-missing values.
  for i := 0 to getInputFormat().numInstances() - 1 do
  begin
    currentInstance := getInputFormat().instance(i);
    if currentInstance.isMissing(index) then
      Continue;
    currentVal := currentInstance.value(index);
    if not hasValues then
    begin
      minVal := currentVal;
      maxVal := currentVal;
      hasValues := true;
    end
    else
    begin
      if currentVal > maxVal then
        maxVal := currentVal;
      if currentVal < minVal then
        minVal := currentVal;
    end;
  end;

  if hasValues and (m_NumBins > 1) then
    binWidth := (maxVal - minVal) / m_NumBins
  else
    binWidth := 0;

  if binWidth > 0 then
  begin
    SetLength(m_CutPoints[index], m_NumBins - 1);
    for i := 1 to m_NumBins - 1 do
      m_CutPoints[index, i - 1] := minVal + binWidth * i;
  end
  else
    SetLength(m_CutPoints[index], 0);
end;

{ Computes cut points for one attribute so that each interval holds
  roughly the same total instance weight, and stores them in
  m_CutPoints[index] (nil when no cut point was found).
  index - attribute index; m_CutPoints must already be sized to hold it.

  The target weight per interval is the desired weight when it is > 0,
  otherwise (total weight) / m_NumBins.

  Fixes relative to the previous version:
    * the sorted working copy is released in a finally block, so it no
      longer leaks when an exception is raised;
    * the redundant initial SetLength(cutPoints, m_NumBins - 1), which
      failed for m_NumBins < 1, is removed;
    * m_NumBins < 1 without a desired weight yields no cut points instead
      of a division by zero;
    * a cut point is never written past the end of cutPoints. }
procedure TDMDiscretize.calculateCutPointsByEqualFrequencyBinning(index : integer);
var
  data : TDMInstances;
  sumOfWeights, freq, counter, last : double;
  cutPoints : DArray;
  cpindex, lastIndex : integer;
  i : integer;
begin
  data := TDMInstances.Create(getInputFormat());
  try
    data.sort(index);

    // Total weight of the leading run of non-missing values.
    // NOTE(review): this relies on sort() placing missing values last - confirm.
    sumOfWeights := 0;
    for i := 0 to data.numInstances() - 1 do
    begin
      if data.instance(i).isMissing(index) then
        Break;
      sumOfWeights := sumOfWeights + data.instance(i).weight();
    end;

    if getDesiredWeightOfInstancesPerInterval() > 0 then
    begin
      freq := getDesiredWeightOfInstancesPerInterval();
      SetLength(cutPoints, trunc(sumOfWeights / freq));
    end
    else
    begin
      if m_NumBins < 1 then
      begin
        m_CutPoints[index] := nil;
        exit;   // the finally block still releases data
      end;
      freq := sumOfWeights / m_NumBins;
      SetLength(cutPoints, m_NumBins - 1);
    end;

    counter := 0;     // weight collected since the last cut point
    last := 0;        // value of counter at the last candidate position
    cpindex := 0;     // number of cut points stored so far
    lastIndex := -1;  // last candidate position, -1 when there is none

    for i := 0 to data.numInstances() - 2 do
    begin
      if data.instance(i).isMissing(index) then
        Break;

      counter := counter + data.instance(i).weight();
      sumOfWeights := sumOfWeights - data.instance(i).weight();

      // A cut can only be placed between two different values.
      if data.instance(i).value(index) < data.instance(i + 1).value(index) then
      begin
        // Enough weight collected, and room left for another cut point?
        if (counter >= freq) and (cpindex < Length(cutPoints)) then
        begin
          // Is this position worse than the previous candidate?
          if ((freq - last) < (counter - freq)) and (lastIndex <> -1) then
          begin
            cutPoints[cpindex] := (data.instance(lastIndex).value(index) +
              data.instance(lastIndex + 1).value(index)) / 2;
            counter := counter - last;
            last := counter;
            lastIndex := i;
          end
          else
          begin
            cutPoints[cpindex] := (data.instance(i).value(index) +
              data.instance(i + 1).value(index)) / 2;
            counter := 0;
            last := 0;
            lastIndex := -1;
          end;
          Inc(cpindex);
          // Re-target the remaining weight over the remaining intervals.
          freq := (sumOfWeights + counter) / ((Length(cutPoints) + 1) - cpindex);
        end
        else
        begin
          lastIndex := i;
          last := counter;
        end;
      end;
    end;

    // Use the pending candidate if there is still room for it.
    if (cpindex < Length(cutPoints)) and (lastIndex <> -1) then
    begin
      cutPoints[cpindex] := (data.instance(lastIndex).value(index) +
        data.instance(lastIndex + 1).value(index)) / 2;
      Inc(cpindex);
    end;

    // Keep only the cut points that were actually found.
    if cpindex = 0 then
      m_CutPoints[index] := nil
    else
    begin
      SetLength(m_CutPoints[index], cpindex);
      for i := 0 to cpindex - 1 do
        m_CutPoints[index, i] := cutPoints[i];
    end;
  finally
    SetLength(cutPoints, 0);
    FreeAndNil(data);
  end;
end;


{ Sizes m_CutPoints to one entry per input attribute and computes cut
  points for every attribute that is in the selected range, numeric and
  not the class attribute. The method used depends on m_FindNumBins and
  m_UseEqualFrequency. }
procedure TDMDiscretize.calculateCutPoints();
var
  col : integer;
  eligible : boolean;
begin
  SetLength(m_CutPoints, getInputFormat().numAttributes());

  for col := getInputFormat().numAttributes() - 1 downto 0 do
  begin
    eligible := m_DiscretizeCols.isInRange(col)
      and getInputFormat().attribute(col).isNumeric()
      and (getInputFormat().classIndex() <> col);
    if not eligible then
      Continue;

    if m_FindNumBins then
      findNumBins(col)
    else if m_UseEqualFrequency then
      calculateCutPointsByEqualFrequencyBinning(col)
    else
      calculateCutPointsByEqualWidthBinning(col);
  end;
end;


{ Returns the configured number of bins (m_NumBins). }
function TDMDiscretize.getBins() : integer;
begin
  Result := Self.m_NumBins;
end;


{ Returns the current column selection as reported by
  m_DiscretizeCols.getRanges. }
function TDMDiscretize.getAttributeIndices() : AnsiString;
begin
  Result := Self.m_DiscretizeCols.getRanges();
end;

{ Builds the output header from the current cut points.
  Every discretized attribute becomes a nominal attribute with one label
  per interval (or the single label 'All' when it has no cut points);
  all other attributes are copied. When no cut points exist at all, the
  output format is cleared. }
procedure TDMDiscretize.setOutputFormat();
const
  QUOTE = '''';
var
  newAttributes, labels : TDMFastVector;
  savedClassIndex : integer;
  col, bin, cutCount : integer;
  lowerText, upperText : AnsiString;
  newFormat : TDMInstances;
  discretized : boolean;
begin
  if m_CutPoints = nil then
  begin
    setFilterOutputFormat(nil);
    exit;
  end;

  newAttributes := TDMFastVector.Create(getInputFormat().numAttributes());
  savedClassIndex := getInputFormat().classIndex();

  for col := 0 to getInputFormat().numAttributes() - 1 do
  begin
    discretized := m_DiscretizeCols.isInRange(col)
      and getInputFormat().attribute(col).isNumeric()
      and (getInputFormat().classIndex() <> col);

    if not discretized then
    begin
      // Attribute is passed through unchanged.
      newAttributes.addElement(getInputFormat().attribute(col).copyObject() as TDMAttribute);
      Continue;
    end;

    labels := TDMFastVector.Create(1);
    cutCount := Length(m_CutPoints[col]);
    if cutCount = 0 then
      labels.addElement(TDMNominalAttributeValue.Create(QUOTE + 'All' + QUOTE))
    else
      // cutCount cut points delimit cutCount + 1 intervals.
      for bin := 0 to cutCount do
      begin
        if bin = 0 then
        begin
          Str(m_CutPoints[col][bin]:4:2, lowerText);
          labels.addElement(TDMNominalAttributeValue.Create(
            QUOTE + '(-inf-' + lowerText + ']' + QUOTE));
        end
        else if bin = cutCount then
        begin
          Str(m_CutPoints[col][bin - 1]:4:2, lowerText);
          labels.addElement(TDMNominalAttributeValue.Create(
            QUOTE + '(' + lowerText + '-inf)' + QUOTE));
        end
        else
        begin
          Str(m_CutPoints[col][bin - 1]:4:2, lowerText);
          Str(m_CutPoints[col][bin]:4:2, upperText);
          labels.addElement(TDMNominalAttributeValue.Create(
            QUOTE + '(' + lowerText + '-' + upperText + ']' + QUOTE));
        end;
      end;

    newAttributes.addElement(
      TDMAttribute.Create(getInputFormat().attribute(col).name(), labels));
    FreeAndNil(labels);
  end;

  newFormat := TDMInstances.Create(getInputFormat().relationName(), newAttributes, 0);
  FreeAndNil(newAttributes);
  newFormat.setClassIndex(savedClassIndex);
  setFilterOutputFormat(newFormat);
end;

{ Converts one input instance to the output format and pushes the result
  onto the output queue.
  instance - the instance to convert.
  For a discretized attribute the new value is the index of the first cut
  point that is >= the original value (the number of cut points when none
  is); a missing value stays missing. Other attributes are copied as-is. }
procedure TDMDiscretize.convertInstance(instance : TDMInstance );
var
  col, bin : integer;
  newValues : DArray;
  rawValue : double;
  converted : TDMInstance;
begin
  SetLength(newValues, outputFormatPeek().numAttributes());

  for col := 0 to getInputFormat().numAttributes() - 1 do
  begin
    if m_DiscretizeCols.isInRange(col) and getInputFormat().attribute(col).isNumeric() and
       (getInputFormat().classIndex() <> col) then
    begin
      rawValue := instance.value(col);
      if instance.isMissing(col) then
        newValues[col] := missing_Value
      else
      begin
        // With no cut points the loop body never runs and the bin is 0.
        bin := 0;
        while bin < Length(m_CutPoints[col]) do
        begin
          if rawValue <= m_CutPoints[col][bin] then
            Break;
          Inc(bin);
        end;
        newValues[col] := bin;
      end;
    end
    else
      newValues[col] := instance.value(col);
  end;

  converted := TDMInstance.Create(instance.weight(), TDMInstanceValues(newValues), instance.size());
  converted.setDataset(getOutputFormat());
  push(converted);

  SetLength(newValues, 0);
end;

{ Signals the end of the current batch.
  When no cut points exist yet they are obtained first - computed when
  m_pointsCalculated is false, otherwise copied from m_ManualCutPoints -
  then the output format is built and all buffered instances converted.
  Returns true when converted instances are waiting in the output queue.
  Raises EIllegalStateException when no input format has been set. }
function TDMDiscretize.batchFinished() : boolean;
var
  row, col, p : integer;
  eligible : boolean;
begin
  if getInputFormat() = nil then
    raise EIllegalStateException.Create('Не определен формат входной выборки');

  if m_CutPoints = nil then
  begin
    if m_pointsCalculated then
    begin
      for col := getInputFormat().numAttributes() - 1 downto 0 do
      begin
        eligible := m_DiscretizeCols.isInRange(col)
          and getInputFormat().attribute(col).isNumeric()
          and (getInputFormat().classIndex() <> col);
        if eligible then
        begin
          // NOTE(review): m_ManualCutPoints[col] is indexed without a
          // length check - confirm callers always size it per attribute.
          SetLength(m_CutPoints, Length(m_ManualCutPoints));
          SetLength(m_CutPoints[col], Length(m_ManualCutPoints[col]));
          for p := 0 to Length(m_CutPoints[col]) - 1 do
            m_CutPoints[col, p] := m_ManualCutPoints[col, p];
          // NOTE(review): only the highest-indexed eligible column is
          // copied before leaving the loop - confirm this is intended.
          Break;
        end;
      end;
    end
    else
      calculateCutPoints();

    setOutputFormat();

    // Convert everything buffered while the cut points were unknown.
    for row := 0 to getInputFormat().numInstances() - 1 do
      convertInstance(getInputFormat().instance(row));
  end;

  flushInput();
  m_NewBatch := true;

  Result := numPendingOutput() <> 0;
end;

{ Runs a whole data set through the filter.
  data - instances to filter; the input format must already be set.
  Returns the object obtained from getOutputFormat, filled with every
  instance produced by the filter. }
function TDMDiscretize.useFilter(data : TDMInstances) : TDMInstances;
var
  row : integer;
  converted : TDMInstance;
begin
  row := 0;
  while row < data.numInstances() do
  begin
    input(data.instance(row));
    Inc(row);
  end;
  batchFinished();

  Result := getOutputFormat();

  // Drain the output queue into the result.
  converted := output();
  while converted <> nil do
  begin
    Result.add(converted);
    FreeAndNil(converted);
    converted := output();
  end;
end;

{ Replaces the stored output format and starts a fresh output queue.
  outputFormat - the new output header, or nil to clear it.
  The previous m_OutputFormat object is freed. }
procedure TDMDiscretize.setFilterOutputFormat(outputFormat : TDMInstances);
begin
  // Both the nil and the non-nil case free the old format first.
  FreeAndNil(m_OutputFormat);
  m_OutputFormat := outputFormat;

  ClearQueue();
  FreeAndNil(m_OutputQueue);
  m_OutputQueue := TQueue.Create;
end;

{ Enables or disables the search for the number of bins.
  newFindNumBins - new value of m_FindNumBins. }
procedure TDMDiscretize.setFindNumBins(newFindNumBins : boolean);
begin
  Self.m_FindNumBins := newFindNumBins;
end;

{ Clears the queue by delegating to the inherited implementation. }
procedure TDMDiscretize.ClearQueue();
begin
  inherited ClearQueue();
end;

{ Releases the column range and both cut-point tables, then runs the
  inherited destructor. The inherited call is made last so this class's
  own fields are released before the ancestor tears down its state. }
destructor TDMDiscretize.Destroy;
begin
  FreeAndNil(m_DiscretizeCols);
  m_CutPoints := nil;
  m_ManualCutPoints := nil;

  inherited Destroy;
end;

end.
Соседние файлы в папке FilterModule