"""Data sets for machine learning problems. (Chapters 18-21)."""
from __future__ import nested_scopes
import utils
from learning import *
import random
def RestaurantDataSet(examples):
    """Wrap `examples` in a DataSet for the restaurant-waiting domain.

    The resulting DataSet is named 'Restaurant', uses 'Wait' as its target
    attribute, and carries the eleven attribute names listed below.
    """
    attribute_names = ('Alternate Bar Fri/Sat Hungry Patrons Price '
                       'Raining Reservation Type WaitEstimate Wait')
    return DataSet(examples=examples,
                   name='Restaurant',
                   target='Wait',
                   attrnames=attribute_names,
                   doc='Data from AIMA [Fig. 18.5]')
# The restaurant-waiting examples: 12 rows, one per line, each holding 11
# whitespace-separated values in the attribute order that RestaurantDataSet
# declares (Alternate ... WaitEstimate, Wait). The last column, Wait, is the
# target. NOTE(review): the string is presumably parsed by DataSet itself --
# confirm in learning.DataSet.
restaurant = RestaurantDataSet("""
Yes No No Yes Some $$$ No Yes French 0-10 Yes
Yes No No Yes Full $ No No Thai 30-60 No
No Yes No No Some $ No No Burger 0-10 Yes
Yes No Yes Yes Full $ No No Thai 10-30 Yes
Yes No Yes No Full $$$ No Yes French >60 No
No Yes No Yes Some $$ Yes Yes Italian 0-10 Yes
No Yes No No None $ Yes No Burger 0-10 No
No No No Yes Some $$ Yes Yes Thai 0-10 Yes
No Yes Yes No Full $ Yes No Burger >60 No
Yes Yes Yes Yes Full $$$ No Yes Italian 10-30 No
No No No No None $ No No Thai 0-10 No
Yes Yes Yes Yes Full $ No No Burger 30-60 Yes""")
def SyntheticRestaurant(n=20):
    """Generate a restaurant DataSet with n synthetic examples.

    Each example picks one value at random for every entry of
    restaurant.values, then overwrites the target entry with the prediction
    of the fixed, hand-built decision tree below, so every label agrees with
    that tree.  The examples are wrapped with RestaurantDataSet.
    """
    def T(attrname, branches):
        # Shorthand for a tree node that tests `attrname`; the node also
        # needs the attribute's index, looked up on the restaurant data set.
        return DecisionTree(restaurant.attrnum(attrname), attrname, branches)

    tree = T('Patrons', {
        'None': 'No',
        'Some': 'Yes',
        'Full': T('WaitEstimate', {
            '>60': 'No',
            '0-10': 'Yes',
            '30-60': T('Alternate', {
                'No': T('Reservation', {
                    'Yes': 'Yes',
                    'No': T('Bar', {'No': 'No', 'Yes': 'Yes'})}),
                'Yes': T('Fri/Sat', {'No': 'No', 'Yes': 'Yes'})}),
            '10-30': T('Hungry', {
                'No': 'Yes',
                'Yes': T('Alternate', {
                    'No': 'Yes',
                    'Yes': T('Raining', {'No': 'No', 'Yes': 'Yes'})})})})})

    def gen():
        # Build a real list rather than using map(): the target slot is
        # assigned by index on the next line, which raises TypeError on
        # Python 3 where map() returns a lazy iterator.
        example = [random.choice(values) for values in restaurant.values]
        example[restaurant.target] = tree.predict(example)
        return example

    return RestaurantDataSet([gen() for _ in range(n)])
# Challenger O-ring data: 23 rows of whitespace-separated values in
# `attrnames` order, with 'Distressed' as the target.
# The backslashes in the LaTeX-style citations of `doc` are written doubled
# ("\\it", "\\&") so the literal contains no invalid escape sequences; the
# resulting string value is the same as with single backslashes.
orings = DataSet(name='O-Rings',
    attrnames="Rings Distressed Temp Pressure Flightnum", target='Distressed',
    examples="""
6 0 66 50 1
6 1 70 50 2
6 0 69 50 3
6 0 68 50 4
6 0 67 50 5
6 0 72 50 6
6 0 73 100 7
6 0 70 100 8
6 1 57 200 9
6 1 63 200 10
6 1 70 200 11
6 0 78 200 12
6 0 67 200 13
6 2 53 200 14
6 0 67 200 15
6 0 75 200 16
6 0 70 200 17
6 0 81 200 18
6 0 76 200 19
6 0 79 200 20
6 0 75 200 21
6 0 76 200 22
6 1 58 200 23""",
    source="http://www1.ics.uci.edu/pub/machine-learning-databases/space-shuttle/",
    doc="""1. Title: Challenger Space Shuttle O-Ring Data (2 databases)
2. Sources:
-- David Draper (draper@math.ucla.edu)
University of California, Los Angeles
-- Donor: David Draper (draper@math.ucla.edu)
-- Date: 5 August 1993
3. Past Usage:
1. Draper,~D. (1993). Assessment and propagation of model uncertainty.
In {\\it Proceedings of the Fourth International Workshop on Artificial
Intelligence and Statistics} (pp. 497--509). Ft. Lauderdale, FL:
Unpublished.
-- Discrete model uncertainty analysis
-- Analysis suggests that obvious different extrapolations of the
data exist at 31 degrees Fahrenheit (i.e., freezing), which sharply
discredits the assumption of no temperature effect.
2. Dalal,~S.~R., Fowlkes,~E.~B., \\& Hoadley,~B. (1989). Risk analysis of
the space shuttle: pre-Challenger prediction of failure. {\\it Journal
of the American Statisticians Association}, {\\it 84}, 945--957.
3. Lavine,~M. (1991). Problems in extrapolation illustrated with space
shuttle O-ring data. {\\it Journal of the American Statisticians
Association}, {\\it 86}, 919--922.
4. Martz~H.~F., \\& Zimmer,~W.~J. (1992). The risk of catastrophic failure
of the solid rocket boosters on the space shuttle. {\\it American
Statistics}, {\\it 46}, 42--47.
4. Number of instances: 23 in each of two files
5. Relevant Information:
There are two databases: (both use the same set of 5 attributes)
1. Primary o-ring erosion and/or blowby
2. Primary o-ring erosion only
The two databases are identical except for the 2nd attribute of the
21st instance (confirmed by David Draper on 8/5/93).
Edited from (Draper, 1993):
The motivation for collecting this database was the explosion of the
USA Space Shuttle Challenger on 28 January, 1986. An investigation
ensued into the reliability of the shuttle's propulsion system. The
explosion was eventually traced to the failure of one of the three field
joints on one of the two solid booster rockets. Each of these six field
joints includes two O-rings, designated as primary and secondary, which
fail when phenomena called erosion and blowby both occur.
The night before the launch a decision had to be made regarding
launch safety. The discussion among engineers and managers leading to
this decision included concern that the probability of failure of the
O-rings depended on the temperature t at launch, which was forecase to
be 31 degrees F. There are strong engineering reasons based on the
composition of O-rings to support the judgment that failure
probability may rise monotonically as temperature drops. One other
variable, the pressure s at which safety testing for field join leaks
was performed, was available, but its relevance to the failure process
was unclear.
Draper's paper includes a menacing figure graphing the number of field
joints experiencing stress vs. liftoff temperature for the 23 shuttle
flights previous to the Challenger disaster. No previous liftoff
temperature was under 53 degrees F. Although tremendous extrapolation
must be done from the given data to assess risk at 31 degrees F, it
is obvious even to the layman "to foresee the unacceptably high risk
created by launching at 31 degrees F." For more information, see
Draper (1993) or the other previous analyses.
The task is to predict the number of O-rings that will experience
thermal distress for a given flight when the launch temperature is
below freezing.
6. Number of Attributes: 5
1. Number of O-rings at risk on a given flight
2. Number experiencing thermal distress
3. Launch temperature (degrees F)
4. Leak-check pressure (psi)
5. Temporal order of flight
7. Attribute Information: all values are positive integers""")
# Zoo animal-classification data: 101 comma-separated rows, each giving an
# animal name, 16 feature values and a class label, in `attrnames` order.
# 'type' is the target and 'name' is passed in `exclude`.
# NOTE(review): the embedded doc describes 'type' as an integer in [1,7], but
# the rows here carry class names (mammal, bird, ...) instead.
zoo = DataSet(name='Zoo', target='type', exclude=['name'],
attrnames="""name hair feathers eggs milk airborne aquatic predator toothed backbone breathes venomous fins legs tail domestic catsize type""",
examples="""
aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,mammal
antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,mammal
bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,fish
bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,mammal
boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
buffalo,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,mammal
calf,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,mammal
carp,0,0,1,0,0,1,0,1,1,0,0,1,0,1,1,0,fish
catfish,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,fish
cavy,1,0,0,1,0,0,0,1,1,1,0,0,4,0,1,0,mammal
cheetah,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
chicken,0,1,1,0,1,0,0,0,1,1,0,0,2,1,1,0,bird
chub,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,fish
clam,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,shellfish
crab,0,0,1,0,0,1,1,0,0,0,0,0,4,0,0,0,shellfish
crayfish,0,0,1,0,0,1,1,0,0,0,0,0,6,0,0,0,shellfish
crow,0,1,1,0,1,0,1,0,1,1,0,0,2,1,0,0,bird
deer,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,mammal
dogfish,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,1,fish
dolphin,0,0,0,1,0,1,1,1,1,1,0,1,0,1,0,1,mammal
dove,0,1,1,0,1,0,0,0,1,1,0,0,2,1,1,0,bird
duck,0,1,1,0,1,1,0,0,1,1,0,0,2,1,0,0,bird
elephant,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,mammal
flamingo,0,1,1,0,1,0,0,0,1,1,0,0,2,1,0,1,bird
flea,0,0,1,0,0,0,0,0,0,1,0,0,6,0,0,0,insect
frog,0,0,1,0,0,1,1,1,1,1,0,0,4,0,0,0,amphibian
frog,0,0,1,0,0,1,1,1,1,1,1,0,4,0,0,0,amphibian
fruitbat,1,0,0,1,1,0,0,1,1,1,0,0,2,1,0,0,mammal
giraffe,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,mammal
girl,1,0,0,1,0,0,1,1,1,1,0,0,2,0,1,1,mammal
gnat,0,0,1,0,1,0,0,0,0,1,0,0,6,0,0,0,insect
goat,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,mammal
gorilla,1,0,0,1,0,0,0,1,1,1,0,0,2,0,0,1,mammal
gull,0,1,1,0,1,1,1,0,1,1,0,0,2,1,0,0,bird
haddock,0,0,1,0,0,1,0,1,1,0,0,1,0,1,0,0,fish
hamster,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,0,mammal
hare,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,0,mammal
hawk,0,1,1,0,1,0,1,0,1,1,0,0,2,1,0,0,bird
herring,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,fish
honeybee,1,0,1,0,1,0,0,0,0,1,1,0,6,0,1,0,insect
housefly,1,0,1,0,1,0,0,0,0,1,0,0,6,0,0,0,insect
kiwi,0,1,1,0,0,0,1,0,1,1,0,0,2,1,0,0,bird
ladybird,0,0,1,0,1,0,1,0,0,1,0,0,6,0,0,0,insect
lark,0,1,1,0,1,0,0,0,1,1,0,0,2,1,0,0,bird
leopard,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
lion,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
lobster,0,0,1,0,0,1,1,0,0,0,0,0,6,0,0,0,shellfish
lynx,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
mink,1,0,0,1,0,1,1,1,1,1,0,0,4,1,0,1,mammal
mole,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,0,mammal
mongoose,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
moth,1,0,1,0,1,0,0,0,0,1,0,0,6,0,0,0,insect
newt,0,0,1,0,0,1,1,1,1,1,0,0,4,1,0,0,amphibian
octopus,0,0,1,0,0,1,1,0,0,0,0,0,8,0,0,1,shellfish
opossum,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,0,mammal
oryx,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,mammal
ostrich,0,1,1,0,0,0,0,0,1,1,0,0,2,1,0,1,bird
parakeet,0,1,1,0,1,0,0,0,1,1,0,0,2,1,1,0,bird
penguin,0,1,1,0,0,1,1,0,1,1,0,0,2,1,0,1,bird
pheasant,0,1,1,0,1,0,0,0,1,1,0,0,2,1,0,0,bird
pike,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,1,fish
piranha,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,fish
pitviper,0,0,1,0,0,0,1,1,1,1,1,0,0,1,0,0,reptile
platypus,1,0,1,1,0,1,1,0,1,1,0,0,4,1,0,1,mammal
polecat,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
pony,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,mammal
porpoise,0,0,0,1,0,1,1,1,1,1,0,1,0,1,0,1,mammal
puma,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
pussycat,1,0,0,1,0,0,1,1,1,1,0,0,4,1,1,1,mammal
raccoon,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
reindeer,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,mammal
rhea,0,1,1,0,0,0,1,0,1,1,0,0,2,1,0,1,bird
scorpion,0,0,0,0,0,0,1,0,0,1,1,0,8,1,0,0,shellfish
seahorse,0,0,1,0,0,1,0,1,1,0,0,1,0,1,0,0,fish
seal,1,0,0,1,0,1,1,1,1,1,0,1,0,0,0,1,mammal
sealion,1,0,0,1,0,1,1,1,1,1,0,1,2,1,0,1,mammal
seasnake,0,0,0,0,0,1,1,1,1,0,1,0,0,1,0,0,reptile
seawasp,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,shellfish
skimmer,0,1,1,0,1,1,1,0,1,1,0,0,2,1,0,0,bird
skua,0,1,1,0,1,1,1,0,1,1,0,0,2,1,0,0,bird
slowworm,0,0,1,0,0,0,1,1,1,1,0,0,0,1,0,0,reptile
slug,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,shellfish
sole,0,0,1,0,0,1,0,1,1,0,0,1,0,1,0,0,fish
sparrow,0,1,1,0,1,0,0,0,1,1,0,0,2,1,0,0,bird
squirrel,1,0,0,1,0,0,0,1,1,1,0,0,2,1,0,0,mammal
starfish,0,0,1,0,0,1,1,0,0,0,0,0,5,0,0,0,shellfish
stingray,0,0,1,0,0,1,1,1,1,0,1,1,0,1,0,1,fish
swan,0,1,1,0,1,1,0,0,1,1,0,0,2,1,0,1,bird
termite,0,0,1,0,0,0,0,0,0,1,0,0,6,0,0,0,insect
toad,0,0,1,0,0,1,0,1,1,1,0,0,4,0,0,0,amphibian
tortoise,0,0,1,0,0,0,0,0,1,1,0,0,4,1,0,1,reptile
tuatara,0,0,1,0,0,0,1,1,1,1,0,0,4,1,0,0,reptile
tuna,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,1,fish
vampire,1,0,0,1,1,0,0,1,1,1,0,0,2,1,0,0,mammal
vole,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,0,mammal
vulture,0,1,1,0,1,0,1,0,1,1,0,0,2,1,0,1,bird
wallaby,1,0,0,1,0,0,0,1,1,1,0,0,2,1,0,1,mammal
wasp,1,0,1,0,1,0,0,0,0,1,1,0,6,0,0,0,insect
wolf,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
worm,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,shellfish
wren,0,1,1,0,1,0,0,0,1,1,0,0,2,1,0,0,bird
""",
source='http://www1.ics.uci.edu/pub/machine-learning-databases/zoo/',
doc="""
1. Title: Zoo database
2. Source Information
-- Creator: Richard Forsyth
-- Donor: Richard S. Forsyth
8 Grosvenor Avenue
Mapperley Park
Nottingham NG3 5DX
0602-621676
-- Date: 5/15/1990
3. Past Usage:
-- None known other than what is shown in Forsyth's PC/BEAGLE User's Guide.
4. Relevant Information:
-- A simple database containing 17 Boolean-valued attributes. The "type"
attribute appears to be the class attribute. Here is a breakdown of
which animals are in which type: (I find it unusual that there are
2 instances of "frog" and one of "girl"!)
Class# Set of animals:
====== ===============================================================
1 (41) aardvark, antelope, bear, boar, buffalo, calf,
cavy, cheetah, deer, dolphin, elephant,
fruitbat, giraffe, girl, goat, gorilla, hamster,
hare, leopard, lion, lynx, mink, mole, mongoose,
opossum, oryx, platypus, polecat, pony,
porpoise, puma, pussycat, raccoon, reindeer,
seal, sealion, squirrel, vampire, vole, wallaby,wolf
2 (20) chicken, crow, dove, duck, flamingo, gull, hawk,
kiwi, lark, ostrich, parakeet, penguin, pheasant,
rhea, skimmer, skua, sparrow, swan, vulture, wren
3 (5) pitviper, seasnake, slowworm, tortoise, tuatara
4 (13) bass, carp, catfish, chub, dogfish, haddock,
herring, pike, piranha, seahorse, sole, stingray, tuna
5 (4) frog, frog, newt, toad
6 (8) flea, gnat, honeybee, housefly, ladybird, moth, termite, wasp
7 (10) clam, crab, crayfish, lobster, octopus,
scorpion, seawasp, slug, starfish, worm
5. Number of Instances: 101
6. Number of Attributes: 18 (animal name, 15 Boolean attributes, 2 numerics)
7. Attribute Information: (name of attribute and type of value domain)
1. animal name: Unique for each instance
2. hair Boolean
3. feathers Boolean
4. eggs Boolean
5. milk Boolean
6. airborne Boolean
7. aquatic Boolean
8. predator Boolean
9. toothed Boolean
10. backbone Boolean
11. breathes Boolean
12. venomous Boolean
13. fins Boolean
14. legs Numeric (set of values: {0,2,4,5,6,8})
15. tail Boolean
16. domestic Boolean
17. catsize Boolean
18. type Numeric (integer values in range [1,7])
8. Missing Attribute Values: None
9. Class Distribution: Given above
""")
# Iris plants data: 150 comma-separated rows (50 per class label) of four
# measurements followed by the class label, in `attrnames` order; 'class' is
# the target. No `source` argument is passed for this data set.
iris = DataSet(name="Iris",
attrnames="sepal-len sepal-width petal-len petal-width class", target="class",
examples="""5.1,3.5,1.4,0.2,setosa
4.9,3.0,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5.0,3.6,1.4,0.2,setosa
5.4,3.9,1.7,0.4,setosa
4.6,3.4,1.4,0.3,setosa
5.0,3.4,1.5,0.2,setosa
4.4,2.9,1.4,0.2,setosa
4.9,3.1,1.5,0.1,setosa
5.4,3.7,1.5,0.2,setosa
4.8,3.4,1.6,0.2,setosa
4.8,3.0,1.4,0.1,setosa
4.3,3.0,1.1,0.1,setosa
5.8,4.0,1.2,0.2,setosa
5.7,4.4,1.5,0.4,setosa
5.4,3.9,1.3,0.4,setosa
5.1,3.5,1.4,0.3,setosa
5.7,3.8,1.7,0.3,setosa
5.1,3.8,1.5,0.3,setosa
5.4,3.4,1.7,0.2,setosa
5.1,3.7,1.5,0.4,setosa
4.6,3.6,1.0,0.2,setosa
5.1,3.3,1.7,0.5,setosa
4.8,3.4,1.9,0.2,setosa
5.0,3.0,1.6,0.2,setosa
5.0,3.4,1.6,0.4,setosa
5.2,3.5,1.5,0.2,setosa
5.2,3.4,1.4,0.2,setosa
4.7,3.2,1.6,0.2,setosa
4.8,3.1,1.6,0.2,setosa
5.4,3.4,1.5,0.4,setosa
5.2,4.1,1.5,0.1,setosa
5.5,4.2,1.4,0.2,setosa
4.9,3.1,1.5,0.1,setosa
5.0,3.2,1.2,0.2,setosa
5.5,3.5,1.3,0.2,setosa
4.9,3.1,1.5,0.1,setosa
4.4,3.0,1.3,0.2,setosa
5.1,3.4,1.5,0.2,setosa
5.0,3.5,1.3,0.3,setosa
4.5,2.3,1.3,0.3,setosa
4.4,3.2,1.3,0.2,setosa
5.0,3.5,1.6,0.6,setosa
5.1,3.8,1.9,0.4,setosa
4.8,3.0,1.4,0.3,setosa
5.1,3.8,1.6,0.2,setosa
4.6,3.2,1.4,0.2,setosa
5.3,3.7,1.5,0.2,setosa
5.0,3.3,1.4,0.2,setosa
7.0,3.2,4.7,1.4,versicolor
6.4,3.2,4.5,1.5,versicolor
6.9,3.1,4.9,1.5,versicolor
5.5,2.3,4.0,1.3,versicolor
6.5,2.8,4.6,1.5,versicolor
5.7,2.8,4.5,1.3,versicolor
6.3,3.3,4.7,1.6,versicolor
4.9,2.4,3.3,1.0,versicolor
6.6,2.9,4.6,1.3,versicolor
5.2,2.7,3.9,1.4,versicolor
5.0,2.0,3.5,1.0,versicolor
5.9,3.0,4.2,1.5,versicolor
6.0,2.2,4.0,1.0,versicolor
6.1,2.9,4.7,1.4,versicolor
5.6,2.9,3.6,1.3,versicolor
6.7,3.1,4.4,1.4,versicolor
5.6,3.0,4.5,1.5,versicolor
5.8,2.7,4.1,1.0,versicolor
6.2,2.2,4.5,1.5,versicolor
5.6,2.5,3.9,1.1,versicolor
5.9,3.2,4.8,1.8,versicolor
6.1,2.8,4.0,1.3,versicolor
6.3,2.5,4.9,1.5,versicolor
6.1,2.8,4.7,1.2,versicolor
6.4,2.9,4.3,1.3,versicolor
6.6,3.0,4.4,1.4,versicolor
6.8,2.8,4.8,1.4,versicolor
6.7,3.0,5.0,1.7,versicolor
6.0,2.9,4.5,1.5,versicolor
5.7,2.6,3.5,1.0,versicolor
5.5,2.4,3.8,1.1,versicolor
5.5,2.4,3.7,1.0,versicolor
5.8,2.7,3.9,1.2,versicolor
6.0,2.7,5.1,1.6,versicolor
5.4,3.0,4.5,1.5,versicolor
6.0,3.4,4.5,1.6,versicolor
6.7,3.1,4.7,1.5,versicolor
6.3,2.3,4.4,1.3,versicolor
5.6,3.0,4.1,1.3,versicolor
5.5,2.5,4.0,1.3,versicolor
5.5,2.6,4.4,1.2,versicolor
6.1,3.0,4.6,1.4,versicolor
5.8,2.6,4.0,1.2,versicolor
5.0,2.3,3.3,1.0,versicolor
5.6,2.7,4.2,1.3,versicolor
5.7,3.0,4.2,1.2,versicolor
5.7,2.9,4.2,1.3,versicolor
6.2,2.9,4.3,1.3,versicolor
5.1,2.5,3.0,1.1,versicolor
5.7,2.8,4.1,1.3,versicolor
6.3,3.3,6.0,2.5,virginica
5.8,2.7,5.1,1.9,virginica
7.1,3.0,5.9,2.1,virginica
6.3,2.9,5.6,1.8,virginica
6.5,3.0,5.8,2.2,virginica
7.6,3.0,6.6,2.1,virginica
4.9,2.5,4.5,1.7,virginica
7.3,2.9,6.3,1.8,virginica
6.7,2.5,5.8,1.8,virginica
7.2,3.6,6.1,2.5,virginica
6.5,3.2,5.1,2.0,virginica
6.4,2.7,5.3,1.9,virginica
6.8,3.0,5.5,2.1,virginica
5.7,2.5,5.0,2.0,virginica
5.8,2.8,5.1,2.4,virginica
6.4,3.2,5.3,2.3,virginica
6.5,3.0,5.5,1.8,virginica
7.7,3.8,6.7,2.2,virginica
7.7,2.6,6.9,2.3,virginica
6.0,2.2,5.0,1.5,virginica
6.9,3.2,5.7,2.3,virginica
5.6,2.8,4.9,2.0,virginica
7.7,2.8,6.7,2.0,virginica
6.3,2.7,4.9,1.8,virginica
6.7,3.3,5.7,2.1,virginica
7.2,3.2,6.0,1.8,virginica
6.2,2.8,4.8,1.8,virginica
6.1,3.0,4.9,1.8,virginica
6.4,2.8,5.6,2.1,virginica
7.2,3.0,5.8,1.6,virginica
7.4,2.8,6.1,1.9,virginica
7.9,3.8,6.4,2.0,virginica
6.4,2.8,5.6,2.2,virginica
6.3,2.8,5.1,1.5,virginica
6.1,2.6,5.6,1.4,virginica
7.7,3.0,6.1,2.3,virginica
6.3,3.4,5.6,2.4,virginica
6.4,3.1,5.5,1.8,virginica
6.0,3.0,4.8,1.8,virginica
6.9,3.1,5.4,2.1,virginica
6.7,3.1,5.6,2.4,virginica
6.9,3.1,5.1,2.3,virginica
5.8,2.7,5.1,1.9,virginica
6.8,3.2,5.9,2.3,virginica
6.7,3.3,5.7,2.5,virginica
6.7,3.0,5.2,2.3,virginica
6.3,2.5,5.0,1.9,virginica
6.5,3.0,5.2,2.0,virginica
6.2,3.4,5.4,2.3,virginica
5.9,3.0,5.1,1.8,virginica""",
doc="""1. Title: Iris Plants Database
Updated Sept 21 by C.Blake - Added discrepency information
2. Sources:
(a) Creator: R.A. Fisher
(b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
(c) Date: July, 1988
3. Past Usage:
- Publications: too many to mention!!! Here are a few.
1. Fisher,R.A. "The use of multiple measurements in taxonomic problems"
Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions
to Mathematical Statistics" (John Wiley, NY, 1950).
2. Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.
(Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.
3. Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
Structure and Classification Rule for Recognition in Partially Exposed
Environments". IEEE Transactions on Pattern Analysis and Machine
Intelligence, Vol. PAMI-2, No. 1, 67-71.
-- Results:
-- very low misclassification rates (0% for the setosa class)
4. Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE
Transactions on Information Theory, May 1972, 431-433.
-- Results:
-- very low misclassification rates again
5. See also: 1988 MLC Proceedings, 54-64. Cheeseman et al's AUTOCLASS II
conceptual clustering system finds 3 classes in the data.
4. Relevant Information:
--- This is perhaps the best known database to be found in the pattern
recognition literature. Fisher's paper is a classic in the field
and is referenced frequently to this day. (See Duda & Hart, for
example.) The data set contains 3 classes of 50 instances each,
where each class refers to a type of iris plant. One class is
linearly separable from the other 2; the latter are NOT linearly
separable from each other.
--- Predicted attribute: class of iris plant.
--- This is an exceedingly simple domain.
--- This data differs from the data presented in Fishers article
(identified by Steve Chadwick, spchadwick@espeedaz.net )
The 35th sample should be: 4.9,3.1,1.5,0.2,"Iris-setosa"
where the error is in the fourth feature.
The 38th sample: 4.9,3.6,1.4,0.1,"Iris-setosa"
where the errors are in the second and third features.
5. Number of Instances: 150 (50 in each of three classes)
6. Number of Attributes: 4 numeric, predictive attributes and the class
7. Attribute Information:
1. sepal length in cm
2. sepal width in cm
3. petal length in cm
4. petal width in cm
5. class:
-- Iris Setosa
-- Iris Versicolour
-- Iris Virginica
8. Missing Attribute Values: None
Summary Statistics:
Min Max Mean SD Class Correlation
sepal length: 4.3 7.9 5.84 0.83 0.7826
sepal width: 2.0 4.4 3.05 0.43 -0.4194
petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)
petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)
9. Class Distribution: 33.3% for each of 3 classes.""")
# Artificial, generated examples.
def Majority(k, n):
    """Return a DataSet with n k-bit examples of the majority problem:
    k random bits followed by a 1 if more than half the bits are 1, else 0.

    The label is stored as an int (1 or 0), matching the bit values and the
    labels produced by Parity, rather than as a bool.
    """
    examples = []
    for _example in range(n):
        bits = [random.choice([0, 1]) for _bit in range(k)]
        ones = utils.sum(bits)
        # "ones > k/2" restated without division, so the result does not
        # depend on integer-vs-true division semantics.
        bits.append(int(2 * ones > k))
        examples.append(bits)
    return DataSet(name="majority", examples=examples)
def Parity(k, n, name="parity"):
    """Build a DataSet called `name` holding n examples of k-bit parity.

    Every example is k randomly chosen bits with one extra trailing entry:
    the sum of those bits modulo 2 (1 for an odd number of ones, else 0).
    """
    def random_example():
        bits = [random.choice([0, 1]) for _bit in range(k)]
        return bits + [utils.sum(bits) % 2]

    return DataSet(name=name,
                   examples=[random_example() for _example in range(n)])
def Xor(n):
    """Build a DataSet named "xor" with n examples of two-input xor.

    Two-input xor is the 2-bit case of the parity problem, so this simply
    delegates to Parity.
    """
    return Parity(k=2, n=n, name="xor")
def ContinuousXor(n):
    """Build a DataSet named "continuous xor" with n examples.

    Each example is [x, y, label]: x and y are drawn independently with
    random.uniform(0.0, 2.0), and label is True when their integer parts
    differ (the xor of the truncated inputs), otherwise False.
    """
    def sample():
        x = random.uniform(0.0, 2.0)
        y = random.uniform(0.0, 2.0)
        return [x, y, int(x) != int(y)]

    return DataSet(name="continuous xor",
                   examples=[sample() for _example in range(n)])
def compare(algorithms=[MajorityLearner, NaiveBayesLearner,
                        NearestNeighborLearner, DecisionTreeLearner],
            datasets=[iris, orings, zoo, restaurant, SyntheticRestaurant(20),
                      Majority(7, 100), Parity(7, 100), Xor(100)],
            k=10, trials=1):
    """Cross-validate each learner on each data set and print a table.

    The table has one row per algorithm (labelled with the class name minus
    'Learner') and one column per data set (headed by the first seven
    characters of its name).  Each cell is the value returned by
    cross_validation(learner, dataset, k, trials), using a freshly
    constructed learner for every cell.

    Note that the default data sets, including the randomly generated ones,
    are created once, when this function is defined.
    """
    rows = []
    for learner_class in algorithms:
        row = [learner_class.__name__.replace('Learner', '')]
        for dataset in datasets:
            row.append(cross_validation(learner_class(), dataset, k, trials))
        rows.append(row)
    column_titles = [''] + [dataset.name[0:7] for dataset in datasets]
    utils.print_table(rows, header=column_titles, round=2)
# Copyright (c) 2002, Peter Norvig
# See also AI Programming (Python), Python.org Tutorial, Language Ref, Libraries.