Download Multidimensional Scaling

Survey
yes no Was this document useful for you?
   Thank you for your participation!

* Your assessment is very important for improving the work of artificial intelligence, which forms the content of this project

Transcript
Multidimensional Scaling
1. Classical Metric MDS
1.a. European Cities Data (from book)
cities.dist=read.table("http://www.uidaho.edu/~stevel/519/Data/CITIES.txt"
,row.names=1)
cities.dist
V2
V3
V4
V5
V6
V7
V8
Athens
0 1119 1777 1486 1475 1303 646
Berlin 1119
0 817 577 1159 545 736
Dublin 1777 817
0 291 906 489 1182
London 1486 577 291
0 783 213 897
Madrid 1475 1159 906 783
0 652 856
Paris 1303 545 489 213 652
0 694
Rome
646 736 1182 897 856 694
0
Warsaw 1013 327 1135 904 1483 859 839
cities = rownames(cities.dist)
colnames(cities.dist) = cities
cities.dist
Athens Berlin Dublin London Madrid
Athens
0
1119
1777
1486
1475
Berlin
1119
0
817
577
1159
Dublin
1777
817
0
291
906
London
1486
577
291
0
783
Madrid
1475
1159
906
783
0
Paris
1303
545
489
213
652
Rome
646
736
1182
897
856
Warsaw
1013
327
1135
904
1483
cities.mds = cmdscale(cities.dist, k=2)
plot(cities.mds)
identify(cities.mds, labels=cities)
[1] 1 2 3 4 5 6 7 8
Compare to the map of Europe.
V9
1013
327
1135
904
1483
859
839
0
Paris Rome Warsaw
1303 646
1013
545 736
327
489 1182
1135
213 897
904
652 856
1483
0 694
859
694
0
839
859 839
0
1.b. More European Cities Data
eurodist
Barcelona
Brussels
Calais
Cherbourg
Cologne
Copenhagen
Geneva
Gibraltar
Hamburg
Hook of Holland
Lisbon
Lyons
Madrid
Marseilles
Milan
Munich
Paris
Rome
Stockholm
Vienna
Athens Barcelona Brussels Calais Cherbourg Cologne Copenhagen
3313
2963
1318
3175
1326
204
3339
1294
583
460
2762
1498
206
409
785
3276
2218
966
1136
1545
760
2610
803
677
747
853
1662
1418
4485
1172
2256
2224
2047
2436
3196
2977
2018
597
714
1115
460
460
3030
1490
172
330
731
269
269
4532
1305
2084
2052
1827
2290
2971
2753
645
690
739
789
714
1458
3949
636
1558
1550
1347
1764
2498
2865
521
1011
1059
1101
1035
1778
2282
1014
925
1077
1209
911
1537
2179
1365
747
977
1160
583
1104
3000
1033
285
280
340
465
1176
817
1460
1511
1662
1794
1497
2050
3927
2868
1616
1786
2196
1403
650
1991
1802
1175
1381
1588
937
1455
euro.mds = cmdscale(eurodist, k=2)
plot(euro.mds, type="n")
text(euro.mds, rownames(euro.mds))
euro.mds[,2] = -1*euro.mds[,2]
plot(euro.mds, type="n")
text(euro.mds, rownames(euro.mds))
1.c. State Presidential Voting Patterns 1960-1976
library(cluster)
data(votes.repub)
repub = votes.repub[,27:31]
head(repub)
X1960 X1964 X1968 X1972 X1976
Alabama
41.75 69.5 14.0 72.4 43.48
Alaska
50.94 34.1 45.3 58.1 62.91
Arizona
55.52 50.4 54.8 64.7 58.62
Arkansas
43.06 43.9 30.8 68.9 34.97
California 50.10 40.9 47.8 55.0 50.89
Colorado
54.63 38.7 50.5 62.6 55.89
pairs(repub)
repub.dist = dist(repub)
# create a distance matrix
repub.mds = cmdscale(repub.dist, k=2)
plot(repub.mds, type="n")
text(repub.mds, state.abb)
Note that metric MDS on Euclidean distance is equivalent to PCA (up to reflection).
repub.pca = prcomp(repub)
plot(-repub.pca$x[,1],-repub.pca$x[,2], type="n")
text(-repub.pca$x[,1],-repub.pca$x[,2], state.abb)
text(repub.mds, state.abb, col="red",cex=.9)
20
NE
0
-10
-repub.pca$x[, 2]
10
UTID
IDAZ
AZ
KS
WY
WY
OK
OK
IN
ND IN
SD
CO
CO
VA FL
VT NH
VT
NMNV
MT
IA MT
IA
IL
IL
AK CA
TN
NC
OH
OH
WA
WA
ORWI
WI
KY
DE
NJ
MO
MO
ME
ME
TX
PA
PA
CT
MI MD
MI
MD
NY
MN
MN
WV
WV
MS
MS
SC
AL
AL
GA
LA
LA
AR
AR
-30
-20
HI
MA
MA
RI
RI
0
20
40
60
-repub.pca$x[, 1]
Convince yourself that metric MDS on non-Euclidean distance is not equivalent to PCA
by trying the following:
20
pvalue = 2.5
repub.dist = dist(repub,method='minkowski',p=pvalue)
repub.mds = cmdscale(repub.dist, k=2)
plot(-repub.pca$x[,1],-repub.pca$x[,2], type="n")
text(-repub.pca$x[,1],-repub.pca$x[,2], state.abb)
text(repub.mds, state.abb, col="red",cex=.9)
NE
NE
UTIDAZ
UTIDAZ
KS
WY
0
-10
OK
ND
ND ININOK
SC
SC
SDSD
FL
CO
CO
VA FL
VTVTNH
NH
VA
MT
NMNV
NM
IA IA MT
NV
AK IL
IL
CA
AK CA
TN
NC
OH
TN
NC
ORWI
OH
WA
DE
KY
NJMO
ORME
KY
WIWA
DE
MO
ME NJ
TX
TX
PA
CT
MI
PA
CT
NY
MI MD
MD
NYMN
WV
AR
MN
WV
AR
MS
MS
AL
AL
GA
GA
LA
LA
HI
-20
HI
MA
RI
-30
-repub.pca$x[, 2]
10
KS
WY
MA
RI
0
20
-repub.pca$x[, 1]
40
60
2. Nonmetric MDS
With nonmetric MDS, we don’t try to match the distances precisely, only to get them in
the right order.
2.a. Car dissimilarity data
car.dis=read.tri("CAR.txt")
Read 55 items
rownames(car.dis) = c("BMW","Ford","Infiniti","Jeep","Lexus",
"Chrysler","Mercedes","Saab","Porsche","Volvo")
colnames(car.dis) = rownames(car.dis)
car.dis
BMW
Ford
Infiniti
Jeep
Lexus
Chrysler
Mercedes
Saab
Porsche
Volvo
BMW Ford Infiniti Jeep Lexus Chrysler Mercedes Saab Porsche Volvo
0
34
8
31
7
43
3
10
6
33
34
0
24
2
26
14
28
18
39
11
8
24
0
25
1
35
5
20
41
22
31
2
25
0
27
15
29
17
38
12
7
26
1
27
0
37
4
13
40
23
43
14
35
15
37
0
42
36
45
9
3
28
5
29
4
42
0
19
32
30
10
18
20
17
13
36
19
0
21
16
6
39
41
38
40
45
32
21
0
44
33
11
22
12
23
9
30
16
44
0
library(MASS)
car.nmds = isoMDS(car.dis, k=2)
initial value 10.466056
iter
5 value 6.523088
iter 10 value 4.865126
iter 15 value 4.088399
iter 20 value 4.003118
final value 3.989006
converged
plot(car.nmds$points)
identify(car.nmds$points, labels=rownames(car.dis))
[1] 1 2 3 4 5 6 7 8 9 10
car.nmds$stress
[1] 3.989006
2.b. Shepard Plot
The Shepard plot shows how well the projected distances match those in the distance
(dissimilarity) matrix.
delta = as.dist(car.dis)
delta
BMW Ford Infiniti Jeep Lexus Chrysler Mercedes Saab Porsche
Ford
34
Infiniti
8
24
Jeep
31
2
25
Lexus
7
26
1
27
Chrysler 43
14
35
15
37
Mercedes
3
28
5
29
4
42
Saab
10
18
20
17
13
36
19
Porsche
6
39
41
38
40
45
32
21
Volvo
33
11
22
12
23
9
30
16
44
d = dist(car.nmds$points)
d
Ford
Infiniti
Jeep
Lexus
Chrysler
Mercedes
Saab
Porsche
Volvo
Saab
Porsche
Volvo
BMW
32.8060844
18.1377177
32.6456779
18.0181026
47.7686100
10.3618904
14.9538173
19.9019022
33.5204834
Mercedes
18.9017790
30.2506671
29.7370576
Ford
Infiniti
Jeep
Lexus
Chrysler
29.7624491
0.1622511
30.0796050
19.7866165
34.1466512
18.0201463
37.7507222
16.3158408
Saab
29.6418391
0.3569870
37.3082959
9.6963302
19.2685742
36.9161416
21.5573819
Porsche
29.9583991
19.9064684
34.0012532
17.8581765
37.6016077
16.3076783
37.6642511
9.4414663
19.4326852
36.8838082
21.9131138
45.2285751
34.8212118
56.7975285
15.7526137
23.0762257
22.8928864 45.9630773
plot(d, delta, pch=16)
car.sh = Shepard(delta, car.nmds$points)
plot(car.sh$y, car.sh$x, pch=16)
# Book's orientation
points(car.sh$yf, car.sh$x, pch=3,col="red")
2.c. Drug Use Data – Starting with Similarities
drug.cor=read.tri("DRUG_USE.txt")
Read 91 items
rownames(drug.cor) = c("Cigarettes","Beer","Wine","Liquor","Cocaine",
"Tranquillizers","Medication","Heroin","Marijuana","Hashish",
"Inhalants","Hallucinogenics","Amphetamines")
colnames(drug.cor) = rownames(drug.cor)
head(drug.cor)
Cigarettes Beer Wine Liquor Cocaine Tranquillizers Medication
Cigarettes
1.000 0.447 0.422 0.435
0.114
0.203
0.091
Beer
0.447 1.000 0.619 0.604
0.068
0.146
0.103
Wine
0.422 0.619 1.000 0.583
0.053
0.139
0.110
Liquor
0.435 0.604 0.583 1.000
0.115
0.258
0.122
Cocaine
0.114 0.068 0.053 0.115
1.000
0.349
0.209
Tranquillizers
0.203 0.146 0.139 0.258
0.349
1.000
0.221
Heroin Marijuana Hashish Inhalants Hallucinogenics Amphetamines
Cigarettes
0.082
0.513
0.304
0.245
0.101
0.245
Beer
0.063
0.445
0.318
0.203
0.088
0.199
Wine
0.066
0.365
0.240
0.183
0.074
0.184
Liquor
0.097
0.482
0.368
0.255
0.139
0.293
Cocaine
0.321
0.186
0.303
0.272
0.279
0.278
Tranquillizers 0.355
0.315
0.377
0.323
0.367
0.545
A correlation matrix is an example of a similarity matrix, where larger values represent
more similar (closer) cases, rather than more distant ones. To do NMDS, we need a
distance matrix. Since only order matters, we simply need to find a transformation that
reverses the order of values, while putting 0s in on the diagonal.
For a matrix such as a correlation matrix, where the diagonal values are all the same and
the largest value in the matrix, we can simply subtract each element from that diagonal
value.
drug.dist = 1-drug.cor
head(drug.dist)
Cigarettes Beer
Cigarettes
0.000 0.553
Beer
0.553 0.000
Wine
0.578 0.381
Liquor
0.565 0.396
Cocaine
0.886 0.932
Tranquillizers
0.797 0.854
Heroin Marijuana
Cigarettes
0.918
0.487
Beer
0.937
0.555
Wine
0.934
0.635
Liquor
0.903
0.518
Cocaine
0.679
0.814
Tranquillizers 0.645
0.685
Wine Liquor Cocaine Tranquillizers Medication
0.578 0.565
0.886
0.797
0.909
0.381 0.396
0.932
0.854
0.897
0.000 0.417
0.947
0.861
0.890
0.417 0.000
0.885
0.742
0.878
0.947 0.885
0.000
0.651
0.791
0.861 0.742
0.651
0.000
0.779
Hashish Inhalants Hallucinogenics Amphetamines
0.696
0.755
0.899
0.755
0.682
0.797
0.912
0.801
0.760
0.817
0.926
0.816
0.632
0.745
0.861
0.707
0.697
0.728
0.721
0.722
0.623
0.677
0.633
0.455
drug.nmds = isoMDS(drug.dist, k=2)
initial value 15.940769
iter
5 value 9.794369
iter 10 value 8.970287
final value 8.941970
converged
plot(drug.nmds$points)
identify(drug.nmds$points, labels=rownames(drug.dist))
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13
drug.sh = Shepard(as.dist(drug.dist), drug.nmds$points)
plot(drug.sh$y, drug.sh$x, pch=16)
points(drug.sh$yf, drug.sh$x, pch=3,col="red")
A more general approach to converting a similarity matrix to a dissimilarity matrix uses
the formula d rs  crr  2crs  css , where crs is the element in the rth row and sth column
of the similarity matrix, and d rs is the same for the dissimilarity matrix. This ensures that
the diagonal elements of the dissimilarity matrix will all be 0, even if the diagonals of the
similarity matrix vary.
drug.dist2 =matrix(0, nrow=13, ncol=13)
for (i in 1:13) for (j in 1:13) drug.dist2[i,j] = sqrt(drug.cor[i,i]2*drug.cor[i,j]+drug.cor[j,j])
plot(drug.dist, drug.dist2)
Since the dissimilarity matrices are essentially the same, they lead to very similar
solutions.
drug.nmds2 = isoMDS(drug.dist2, k=2)
initial value 18.198520
iter
5 value 9.087638
final value 8.874961
converged
plot(drug.nmds2$points)
identify(drug.nmds2$points, labels=rownames(drug.dist))
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13
drug.sh2 = Shepard(as.dist(drug.dist), drug.nmds2$points)
plot(drug.sh2$y, drug.sh2$x, pch=16)
points(drug.sh2$yf, drug.sh2$x, pch=3,col="red")
# Multidimensional Scaling (MDS)
# From your book, a set of distances between European cities
cities.dist <- read.table("CITIES.txt", row.names=1)
cities.dist
cities <- rownames(cities.dist)
colnames(cities.dist) <- cities
cities.dist
cities.mds <- cmdscale(cities.dist, k=2)
plot(cities.mds)
identify(cities.mds, labels=cities)
# A similar example from R
eurodist
euro.mds <- cmdscale(eurodist, k=2)
plot(euro.mds, type="n")
text(euro.mds, rownames(euro.mds))
euro.mds[,2] <- -1*euro.mds[,2]
plot(euro.mds, type="n")
text(euro.mds, rownames(euro.mds))
euro.sh <- Shepard(as.dist(eurodist),euro.mds)
plot(euro.sh$y, euro.sh$x, pch=16)
# Book's orientation
points(euro.sh$yf, euro.sh$x, pch=3,col="red")
# Examine States by Voting Patterns, 1960-1976
library(cluster)
data(votes.repub)
repub <- votes.repub[,27:31]
head(repub)
pairs(repub)
repub.dist <- dist(repub)
# create a distance matrix
repub.mds <- cmdscale(repub.dist, k=2)
plot(repub.mds, type="n")
text(repub.mds, state.abb)
repub.pca <- prcomp(repub)
plot(-repub.pca$x[,1],repub.pca$x[,2], type="n")
text(-repub.pca$x[,1],repub.pca$x[,2], state.abb)
text(repub.mds, state.abb, col="red")
# Nonmetric MDS - Car Dissimilarity Data
library(MASS)
# Needed for nonmetric MDS
car.dis <- read.tri("CAR.txt")
rownames(car.dis) <- c("BMW","Ford","Infiniti","Jeep","Lexus",
"Chrysler","Mercedes","Saab","Porsche","Volvo")
colnames(car.dis) <- rownames(car.dis)
car.dis
car.nmds <- isoMDS(car.dis, k=2)
plot(car.nmds$points)
identify(car.nmds$points, labels=rownames(car.dis))
car.nmds$stress
# Shepard diagram
delta <- as.dist(car.dis)
delta
d <- dist(car.nmds$points)
d
plot(d, delta, pch=16)
car.sh <- Shepard(delta, car.nmds$points)
plot(car.sh$y, car.sh$x, pch=16)
points(car.sh$yf, car.sh$x, pch=3,col="red")
# Book's orientation
# Drug Use Data - Correlations as Variable Similarities
drug.cor <- read.tri("DRUG_USE.txt")
rownames(drug.cor) <- c("Cigarettes","Beer","Wine","Liquor","Cocaine",
"Tranquillizers","Medication","Heroin","Marijuana","Hashish",
"Inhalants","Hallucinogenics","Amphetamines")
colnames(drug.cor) <- rownames(drug.cor)
head(drug.cor)
# Convert to distances - method 1 (works well for correlations)
drug.dist <- 1-drug.cor
head(drug.dist)
drug.nmds <- isoMDS(drug.dist, k=2)
plot(drug.nmds$points)
identify(drug.nmds$points, labels=rownames(drug.dist))
drug.sh <- Shepard(as.dist(drug.dist), drug.nmds$points)
plot(drug.sh$y, drug.sh$x, pch=16)
points(drug.sh$yf, drug.sh$x, pch=3,col="red")
# Convert to distances - method 2 (more general)
drug.dist2 <-matrix(0, nrow=13, ncol=13)
for (i in 1:13) for (j in 1:13)
drug.dist2[i,j] <- sqrt(drug.cor[i,i]2*drug.cor[i,j]+drug.cor[j,j])
plot(drug.dist, drug.dist2)
drug.nmds2 <- isoMDS(drug.dist2, k=2)
plot(drug.nmds2$points)
identify(drug.nmds2$points, labels=rownames(drug.dist))
drug.sh2 <- Shepard(as.dist(drug.dist), drug.nmds2$points)
plot(drug.sh2$y, drug.sh2$x, pch=16)
points(drug.sh2$yf, drug.sh2$x, pch=3,col="red")
Similar