Skip to content

Add distribution attribute to box plots to improve log-axis support #7392

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions src/traces/box/attributes.js
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,25 @@ module.exports = {
'Q3 the median of the upper half.'
].join(' ')
},

// Box-trace attribute selecting how the whisker fences are derived.
// The box calc step reads it as `trace.distribution`: *log-normal*, or *auto*
// combined with a log-type value axis, switches the fence math to log space.
// NOTE(review): the defaults step that would coerce this attribute is not
// visible here - confirm it is coerced, otherwise calc never sees a value.
distribution: {
valType: 'enumerated',
values: ['normal', 'log-normal', 'auto'],
dflt: 'auto',
// fences are computed in the calc step, so a change needs a recalc
editType: 'calc',
// the fragments below are joined with single spaces into one paragraph
description: [
'Sets the underlying distribution used to compute the whiskers.',

'If *normal*, the whiskers are computed using the standard 1.5 * IQR rule,',
'when displaying your data on a linear scale.',

'If *log-normal*, the whiskers are computed based on the IQR in log units,',
'which prevents the lower fence from ever going negative (resulting in an',
'infinitely long whisker on a log scale).',

'If *auto*, uses *log-normal* when displayed on a log axis, otherwise *normal*.'
].join(' ')
},

width: {
valType: 'number',
Expand Down
38 changes: 36 additions & 2 deletions src/traces/box/calc.js
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ module.exports = function calc(gd, trace) {
posLetter = 'x';
hasPeriod = !!trace.xperiodalignment;
}

// Determine whether to use log-normal distribution for whiskers
var useLogNormal = trace.distribution === 'log-normal' ||
(trace.distribution === 'auto' && valAxis.type === 'log');

var allPosArrays = getPosArrays(trace, posLetter, posAxis, fullLayout[numKey]);
var posArray = allPosArrays[0];
Expand Down Expand Up @@ -78,6 +82,7 @@ module.exports = function calc(gd, trace) {
if(hasPeriod && origPos) {
cdi.orig_p = origPos[i]; // used by hover
}
cdi.usesLogNormal = useLogNormal;

cdi.q1 = d2c('q1');
cdi.med = d2c('median');
Expand Down Expand Up @@ -213,6 +218,7 @@ module.exports = function calc(gd, trace) {
if(ptsPerBin[i].length > 0) {
cdi = {};
cdi.pos = cdi[posLetter] = posDistinct[i];
cdi.usesLogNormal = useLogNormal;

pts = cdi.pts = ptsPerBin[i].sort(sortByVal);
boxVals = cdi[valLetter] = pts.map(extractVal);
Expand Down Expand Up @@ -407,10 +413,24 @@ function extractVal(o) { return o.v; }
// last point below 1.5 * IQR
// Returns the lower whisker end for one box.
//   cdi     - calcdata item; reads cdi.q1, cdi.q3 and cdi.usesLogNormal
//   boxVals - sample values of the box, looked up via Lib.findBin
//             (presumably sorted ascending - confirm against the caller)
//   N       - number of samples; with N === 0 the result is simply cdi.q1
// The result is capped at cdi.q1, so it never lies above the first quartile.
function computeLowerFence(cdi, boxVals, N) {
if(N === 0) return cdi.q1;

var lowerFence;

if (cdi.usesLogNormal) {
// For log-normal distribution, compute fence in log space to prevent negative values
// Quartiles are clamped to Number.MIN_VALUE so Math.log never sees zero or a
// negative number.
// NOTE(review): for q1 <= 0 < q3 the clamp gives logQ1 of about -744, logIQR
// becomes huge and the fence underflows to 0 - confirm this fallback is intended.
var logQ1 = Math.log(Math.max(cdi.q1, Number.MIN_VALUE));
var logQ3 = Math.log(Math.max(cdi.q3, Number.MIN_VALUE));
var logIQR = logQ3 - logQ1;
lowerFence = Math.exp(logQ1 - 1.5 * logIQR);
} else {
// Standard 1.5 * IQR calculation (2.5*Q1 - 1.5*Q3 is algebraically equivalent)
// i.e. Q1 - 1.5 * (Q3 - Q1)
lowerFence = 2.5 * cdi.q1 - 1.5 * cdi.q3;
}

// Pick the sample one bin above the fence position, limited to the last
// index (N - 1), then cap the result at q1.
// NOTE(review): the first findBin argument below is the pre-change expression
// and the second the post-change one; as written the inner Math.min receives
// both indices and keeps the smaller. Confirm only the `lowerFence` form
// should remain.
return Math.min(
cdi.q1,
boxVals[Math.min(
Lib.findBin(2.5 * cdi.q1 - 1.5 * cdi.q3, boxVals, true) + 1,
Lib.findBin(lowerFence, boxVals, true) + 1,
N - 1
)]
);
Expand All @@ -419,10 +439,24 @@ function computeLowerFence(cdi, boxVals, N) {
// last point above 1.5 * IQR
// Returns the upper whisker end for one box.
//   cdi     - calcdata item; reads cdi.q1, cdi.q3 and cdi.usesLogNormal
//   boxVals - sample values of the box, looked up via Lib.findBin
//             (presumably sorted ascending - confirm against the caller)
//   N       - number of samples; with N === 0 the result is simply cdi.q3
// The result is floored at cdi.q3, so it never lies below the third quartile.
function computeUpperFence(cdi, boxVals, N) {
if(N === 0) return cdi.q3;

var upperFence;

if (cdi.usesLogNormal) {
// For log-normal distribution, compute fence in log space
// Quartiles are clamped to Number.MIN_VALUE so Math.log never sees zero or a
// negative number.
// NOTE(review): for q1 <= 0 < q3 the clamp gives logQ1 of about -744, logIQR
// becomes huge and Math.exp overflows to Infinity, so the fence sits above
// every sample - confirm this fallback is intended.
var logQ1 = Math.log(Math.max(cdi.q1, Number.MIN_VALUE));
var logQ3 = Math.log(Math.max(cdi.q3, Number.MIN_VALUE));
var logIQR = logQ3 - logQ1;
upperFence = Math.exp(logQ3 + 1.5 * logIQR);
} else {
// Standard 1.5 * IQR calculation (2.5*Q3 - 1.5*Q1 is algebraically equivalent)
// i.e. Q3 + 1.5 * (Q3 - Q1)
upperFence = 2.5 * cdi.q3 - 1.5 * cdi.q1;
}

// Pick the sample at the fence position, limited to index 0 from below,
// then floor the result at q3.
// NOTE(review): the first findBin argument below is the pre-change expression
// and the second the post-change one; as written the inner Math.max receives
// both indices and keeps the larger. Confirm only the `upperFence` form
// should remain.
return Math.max(
cdi.q3,
boxVals[Math.max(
Lib.findBin(2.5 * cdi.q3 - 1.5 * cdi.q1, boxVals),
Lib.findBin(upperFence, boxVals),
0
)]
);
Expand Down
39 changes: 39 additions & 0 deletions test/image/mocks/box_distribution.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{
"data": [
{
"type": "box",
"name": "Normal Dist (Linear)",
"x": [1],
"y": [1, 2, 3, 4, 5, 10, 20, 100],
"distribution": "normal",
"boxmean": true
},
{
"type": "box",
"name": "Log-Normal Dist (Linear)",
"x": [2],
"y": [1, 2, 3, 4, 5, 10, 20, 100],
"distribution": "log-normal",
"boxmean": true
},
{
"type": "box",
"name": "Auto Dist (Linear)",
"x": [3],
"y": [1, 2, 3, 4, 5, 10, 20, 100],
"distribution": "auto",
"boxmean": true
}
],
"layout": {
"title": {
"text": "Box Plot with Different Distribution Types (Linear Scale)"
},
"xaxis": {
"title": "Distribution Type"
},
"yaxis": {
"title": "Values"
}
}
}
40 changes: 40 additions & 0 deletions test/image/mocks/box_distribution_log.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{
"data": [
{
"type": "box",
"name": "Normal Dist (Log)",
"x": [1],
"y": [1, 2, 3, 4, 5, 10, 20, 100],
"distribution": "normal",
"boxmean": true
},
{
"type": "box",
"name": "Log-Normal Dist (Log)",
"x": [2],
"y": [1, 2, 3, 4, 5, 10, 20, 100],
"distribution": "log-normal",
"boxmean": true
},
{
"type": "box",
"name": "Auto Dist (Log)",
"x": [3],
"y": [1, 2, 3, 4, 5, 10, 20, 100],
"distribution": "auto",
"boxmean": true
}
],
"layout": {
"title": {
"text": "Box Plot with Different Distribution Types (Log Scale)"
},
"xaxis": {
"title": "Distribution Type"
},
"yaxis": {
"type": "log",
"title": "Values (log scale)"
}
}
}
178 changes: 178 additions & 0 deletions test/jasmine/tests/box_test.js
Original file line number Diff line number Diff line change
Expand Up @@ -1228,6 +1228,184 @@ describe('Test box calc', function() {
Plots.doCalcdata(gd);
return gd.calcdata[0];
}

it('should compute fence values differently depending on *distribution*', function() {
    // Strictly positive, right-skewed sample: the large outlier stretches the
    // IQR so the arithmetic 1.5 * IQR fence falls well below the data.
    var y = [10, 20, 30, 40, 1000];

    // Explicit *normal*: calc must not switch to the log-space rule.
    var normalCd = _calc({
        y: y,
        distribution: 'normal'
    });
    expect(normalCd[0].usesLogNormal).toBe(false, 'normal distribution does not use log-space fences');

    // Explicit *log-normal*: log-space rule, lower fence stays positive.
    var logNormalCd = _calc({
        y: y,
        distribution: 'log-normal'
    });
    expect(logNormalCd[0].usesLogNormal).toBe(true, 'log-normal distribution uses log-space fences');
    expect(logNormalCd[0].lf).toBeGreaterThan(0, 'log-normal distribution lower fence is positive');

    // *auto* on a log value axis must resolve to the log-normal rule.
    var autoOnLogCd = _calc({
        y: y,
        distribution: 'auto'
    }, {
        yaxis: {type: 'log'}
    });
    expect(autoOnLogCd[0].usesLogNormal).toBe(true, 'auto distribution uses log-space fences on log axis');
    expect(autoOnLogCd[0].lf).toBeGreaterThan(0, 'auto distribution on log axis');
    expect(autoOnLogCd[0].lf).toBeCloseTo(logNormalCd[0].lf, 6, 'auto distribution equals log-normal on log axis');
});

it('should prevent negative whiskers with log-normal distribution', function() {
    // Strictly positive, right-skewed sample. With the arithmetic rule the raw
    // fence value (2.5 * q1 - 1.5 * q3) would be negative for data like this.
    var dataset = [2, 3, 5, 10, 200];

    var cdLogNormal = _calc({
        y: dataset,
        distribution: 'log-normal'
    });

    // NOTE(review): the reported `lf` is capped at q1 and otherwise taken from
    // the samples, so with all-positive data this assertion alone may not
    // distinguish the two rules - consider asserting an exact expected value.
    expect(cdLogNormal[0].lf).toBeGreaterThan(0, 'log-normal lower fence is positive');
});

it('should set usesLogNormal flag correctly for log-normal distribution', function() {
    // Sample spread over two orders of magnitude, requested with the
    // log-normal whisker rule.
    var trace = {
        y: [1, 2, 5, 10, 20, 50, 100],
        distribution: 'log-normal'
    };
    var calcdata = _calc(trace);

    // calc must tag the box as computed with the log-space rule
    expect(calcdata[0].usesLogNormal).toBe(true, 'usesLogNormal flag is set for log-normal distribution');

    // both whiskers must be sensible: positive and extending beyond the box
    expect(calcdata[0].lf).toBeGreaterThan(0, 'log-normal lower fence is positive');
    expect(calcdata[0].lf).toBeLessThan(calcdata[0].q1, 'lower fence is less than q1');
    expect(calcdata[0].uf).toBeGreaterThan(calcdata[0].q3, 'upper fence is greater than q3');
});

it('should use correct distribution mode for auto setting', function() {
    var sample = [1, 2, 5, 10, 20, 50, 100];

    // ---- linear value axis: *auto* has to match explicit *normal* ----
    var autoOnLinear = _calc(
        {y: sample, distribution: 'auto'},
        {yaxis: {type: 'linear'}}
    );
    var explicitNormal = _calc({y: sample, distribution: 'normal'});

    expect(autoOnLinear[0].lf).toBeCloseTo(explicitNormal[0].lf, 6, 'auto distribution equals normal on linear axis');
    expect(autoOnLinear[0].uf).toBeCloseTo(explicitNormal[0].uf, 6, 'auto distribution equals normal on linear axis');

    // ---- log value axis: *auto* has to match explicit *log-normal* ----
    var autoOnLog = _calc(
        {y: sample, distribution: 'auto'},
        {yaxis: {type: 'log'}}
    );
    var explicitLogNormal = _calc({y: sample, distribution: 'log-normal'});

    expect(autoOnLog[0].lf).toBeCloseTo(explicitLogNormal[0].lf, 6, 'auto distribution equals log-normal on log axis');
    expect(autoOnLog[0].uf).toBeCloseTo(explicitLogNormal[0].uf, 6, 'auto distribution equals log-normal on log axis');
});

it('should correctly handle explicit fence values', function() {
    var dataset = [1, 2, 5, 10, 20, 50, 100];

    // Baseline run without explicit fences, needed only to obtain q1 and q3.
    var baseline = _calc({
        y: dataset,
        distribution: 'normal'
    });

    // Use the quartiles themselves as explicit fences, i.e. whiskers collapsed
    // onto the box edges (lower fence === q1, upper fence === q3).
    // NOTE(review): presumably these are the tightest fences calc accepts -
    // confirm against the fence validation, which is not visible here.
    var explicitLowerFence = baseline[0].q1;
    var explicitUpperFence = baseline[0].q3;

    // Explicit fences combined with the arithmetic rule
    var cdNormal = _calc({
        y: dataset,
        distribution: 'normal',
        lowerfence: [explicitLowerFence],
        upperfence: [explicitUpperFence]
    });

    // Explicit fences combined with the log-space rule
    var cdLogNormal = _calc({
        y: dataset,
        distribution: 'log-normal',
        lowerfence: [explicitLowerFence],
        upperfence: [explicitUpperFence]
    });

    // Explicit fences must win over the computed ones in both modes
    expect(cdNormal[0].lf).toEqual(explicitLowerFence, 'normal distribution uses valid explicit lower fence');
    expect(cdNormal[0].uf).toEqual(explicitUpperFence, 'normal distribution uses valid explicit upper fence');
    expect(cdLogNormal[0].lf).toEqual(explicitLowerFence, 'log-normal distribution uses valid explicit lower fence');
    expect(cdLogNormal[0].uf).toEqual(explicitUpperFence, 'log-normal distribution uses valid explicit upper fence');
});

it('should handle extreme data distributions correctly', function() {
    // Heavily right-skewed, strictly positive sample: a few huge values on top
    // of a cluster of small ones.
    var skewed = [1, 2, 3, 4, 5, 1000, 2000, 5000];

    var normalTrace = {y: skewed, distribution: 'normal'};
    var logNormalTrace = {y: skewed, distribution: 'log-normal'};

    var normalCd = _calc(normalTrace);
    var logNormalCd = _calc(logNormalTrace);

    // the log-space rule must keep the lower whisker positive
    expect(logNormalCd[0].lf).toBeGreaterThan(0, 'log-normal gives positive lower fence for extreme data');

    // each run must record which rule calc actually applied
    expect(normalCd[0].usesLogNormal).toBe(false, 'normal distribution sets flag to false');
    expect(logNormalCd[0].usesLogNormal).toBe(true, 'log-normal distribution sets flag to true');
});

it('should compute q1/q3 depending on *quartilemethod*', function() {
// samples from https://en.wikipedia.org/wiki/Quartile
Expand Down