1. Define the tree-generation functions
# -*- coding: utf-8 -*-
# Functions for generating the decision tree
from numpy import *
import numpy as np
import pandas as pd
from math import log
import operator

# Calculate the information entropy of a dataset (information entropy is also called Shannon entropy in machine-learning practice).
def calcInfoEnt(dataSet):            # the Label in this problem is loan approved OR not (in the original melon example: good OR bad melon)
    # In dataSet each column is an attribute; the last column is the Label.
    numEntries = len(dataSet)        # each row is a sample
    labelCounts = {}                 # dictionary holding the count of every possible class
    for featVec in dataSet:          # loop over rows: featVec takes every row of the dataset
        currentLabel = featVec[-1]   # featVec[-1] is the last value of each row, i.e. the Label
        if currentLabel not in labelCounts.keys():   # if the current Label is not yet in the dictionary
            labelCounts[currentLabel] = 0            # create the key and initialise it to 0
        labelCounts[currentLabel] += 1               # count the number of samples per class (this line is not inside the if)
    InfoEnt = 0.0
    for key in labelCounts:                          # iterate over every class of Label
        prob = float(labelCounts[key]) / numEntries  # proportion of this class
        InfoEnt -= prob * log(prob, 2)               # information-entropy formula used by ID3
    return InfoEnt

### For a discrete feature: return all samples whose value of the feature equals value (with that feature removed).
def splitDiscreteDataSet(dataSet, axis, value):
    # dataSet is the set of the current node (to be divided), axis is the attribute used for the division,
    # value is the attribute value used for the division.
    retDataSet = []                                  # list holding the returned sub-dataset
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]          # features before this feature stay in the sample
            reducedFeatVec.extend(featVec[axis+1:])  # features after this feature stay in the sample
            retDataSet.append(reducedFeatVec)        # add this sample to the list
    return retDataSet

### For a continuous feature: split the set into two parts, using value as the threshold.
def splitContinuousDataSet(dataSet, axis, value):
    retDataSetG = []                                 # will store samples whose value is greater than value
    retDataSetL = []                                 # will store samples whose value is less than or equal to value
    for featVec in dataSet:
        if featVec[axis] > value:
            reducedFeatVecG = featVec[:axis]
            reducedFeatVecG.extend(featVec[axis+1:])
            retDataSetG.append(reducedFeatVecG)
        else:
            reducedFeatVecL = featVec[:axis]
            reducedFeatVecL.extend(featVec[axis+1:])
            retDataSetL.append(reducedFeatVecL)
    return retDataSetG, retDataSetL                  # returns the two sets as a two-element tuple

### Choose the best feature to split on based on information gain
### (and, for a continuous feature, also choose the value to split by).
def chooseBestFeatureToSplit(dataSet, labels):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcInfoEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    bestSplitDict = {}
    for i in range(numFeatures):
        # Iterate over all features: the next line takes the i-th value of every row,
        # i.e. the values of the i-th feature of all samples in the current set.
        featList = [example[i] for example in dataSet]
        # Determine whether the feature is discrete.
        if not (type(featList[0]).__name__ == 'float' or type(featList[0]).__name__ == 'int'):
            # For a discrete feature: compute the information gain of splitting on it.
            uniqueVals = set(featList)               # build a set from the list (the distinct values of the feature)
            newEntropy = 0.0
            for value in uniqueVals:                 # iterate over every value of the discrete feature
                subDataSet = splitDiscreteDataSet(dataSet, i, value)   # subset for this value
                prob = len(subDataSet) / float(len(dataSet))
                newEntropy += prob * calcInfoEnt(subDataSet)           # accumulate the weighted entropy of each value
            infoGain = baseEntropy - newEntropy      # information gain of splitting on this feature
        # For a continuous feature: compute the information gain of splitting on it
        # (difference: with n values there are n-1 candidate split points, and the best one must be chosen).
        else:
            # Generate the n-1 candidate split points.
            sortfeatList = sorted(featList)
            splitList = []
            for j in range(len(sortfeatList) - 1):
                splitList.append((sortfeatList[j] + sortfeatList[j+1]) / 2.0)
            bestSplitEntropy = 10000                 # a very large entropy value (used below)
            # Iterate over the n-1 candidate points: compute the entropy for the j-th candidate and keep the best one.
            for j in range(len(splitList)):
                value = splitList[j]
                newEntropy = 0.0
                DataSet = splitContinuousDataSet(dataSet, i, value)
                subDataSetG = DataSet[0]
                subDataSetL = DataSet[1]
                probG = len(subDataSetG) / float(len(dataSet))
                newEntropy += probG * calcInfoEnt(subDataSetG)
                probL = len(subDataSetL) / float(len(dataSet))
                newEntropy += probL * calcInfoEnt(subDataSetL)
                if newEntropy < bestSplitEntropy:
                    bestSplitEntropy = newEntropy
                    bestSplit = j
            bestSplitDict[labels[i]] = splitList[bestSplit]  # record the current best split point of this continuous attribute
            infoGain = baseEntropy - bestSplitEntropy        # information gain of splitting on this feature
        # Among all features (continuous and discrete), choose the one with the largest information gain.
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    # If the best splitting feature of the current node is continuous, binarise it according to
    # "whether it is less than or equal to its best split point",
    # i.e. the feature becomes "whether it is <= bestSplitValue", e.g. "Density" becomes "Density<=0.3815".
    # Note: the following block modifies the original dataSet in place; the former float values become 1 and 0.
    # [Why do this?] See the explanation at the end of createTree().
    if type(dataSet[0][bestFeature]).__name__ == 'float' or type(dataSet[0][bestFeature]).__name__ == 'int':
        bestSplitValue = bestSplitDict[labels[bestFeature]]
        labels[bestFeature] = labels[bestFeature] + '<=' + str(bestSplitValue)
        for i in range(shape(dataSet)[0]):
            if dataSet[i][bestFeature] <= bestSplitValue:
                dataSet[i][bestFeature] = 1
            else:
                dataSet[i][bestFeature] = 0
    return bestFeature

# If all features have been used up and the samples at the node are still not all of one class, vote:
# count the number of samples per Label and take the most frequent one.
def majorityCnt(classList):
    classCount = {}                                  # dictionary whose keys are the Labels
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0                     # first occurrence of a Label: add it to the dictionary
        classCount[vote] += 1                        # counting
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]                    # return the most frequent Label
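The helpers above can be sanity-checked on a tiny hand-made dataset before moving on. The following is a minimal sketch: the toy rows and values ('sunny', 'high', ...) are made up for illustration and are not the bank-loan data used later, but they follow the same layout of feature columns followed by the class label in the last column.

# Hypothetical toy data: two discrete features plus a label in the last column.
toy = [['sunny', 'high', 'no'],
       ['sunny', 'low', 'yes'],
       ['cloudy', 'high', 'yes'],
       ['cloudy', 'low', 'yes']]
print(calcInfoEnt(toy))                        # entropy of a 3-vs-1 label split, about 0.811
print(splitDiscreteDataSet(toy, 0, 'sunny'))   # rows where feature 0 == 'sunny', with feature 0 removed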
2. Recursive generation of decision trees
# Main program: recursive generation of the decision tree.
# dataSet:     the current dataset used to build the tree. It starts as data_full and shrinks as the data is divided,
#              because we move down to a branch of the tree. For example, with the 17 melons the first division happens
#              at the root node; the first bestFeat chosen is texture, whose values are clear, slightly fuzzy and fuzzy,
#              so the melons are divided into clear (9), slightly fuzzy (5) and fuzzy (3), and each subset then has one
#              feature fewer available for the next division.
# labels:      the feature names present in the current dataset (some values may no longer occur in the current subset,
#              e.g. at some node all melons might be light white and none dark green).
# data_full:   the complete dataset.
# labels_full: all feature names.
numLine = numColumn = 2   # defined here because global numLine is used below ......
# As to why global is required: I do not fully understand it either. Defining it as a local variable always raises an
# error inside the if below, so I fall back to a global variable. I am still looking for a better solution.
def createTree(dataSet, labels, data_full, labels_full):
    classList = [example[-1] for example in dataSet]
    # Recursion stop condition 1: all samples of the current node belong to the same class.
    # (Note: count() returns the number of times an element appears in a list.)
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Recursion stop condition 2: the sample set of the current node is empty
    # (i.e. no sample takes this particular value of the feature).
    global numLine, numColumn
    (numLine, numColumn) = shape(dataSet)
    if float(numLine) == 0:
        return 'empty'
    # Recursion stop condition 3: all features available for splitting have been used,
    # so majorityCnt() is called to vote on the Label.
    if float(numColumn) == 1:
        return majorityCnt(classList)
    # Otherwise keep dividing:
    bestFeat = chooseBestFeatureToSplit(dataSet, labels)   # find the current best splitting feature
    bestFeatLabel = labels[bestFeat]                       # name of the current best splitting feature
    myTree = {bestFeatLabel: {}}
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    if type(dataSet[0][bestFeat]).__name__ == 'str':
        currentlabel = labels_full.index(labels[bestFeat])
        featValuesFull = [example[currentlabel] for example in data_full]
        uniqueValsFull = set(featValuesFull)
    del(labels[bestFeat])   # the current feature has been used, so remove it from the set of features still to split on
    # [Recursive call] Build a subtree for every value of the feature (bestFeat) used for the current division.
    for value in uniqueVals:               # iterate over the values that actually occur in the current subset
        subLabels = labels[:]
        if type(dataSet[0][bestFeat]).__name__ == 'str':
            uniqueValsFull.remove(value)   # remove the value after dividing (from uniqueValsFull!)
        myTree[bestFeatLabel][value] = createTree(splitDiscreteDataSet(dataSet, bestFeat, value),
                                                  subLabels, data_full, labels_full)
        # splitDiscreteDataSet() is used here because all continuous features have already been turned into
        # discrete 0/1 values by our chooseBestFeatureToSplit().
    if type(dataSet[0][bestFeat]).__name__ == 'str':   # if the feature is discrete [see the note below]
        for value in uniqueValsFull:   # there may be values that no longer occur in the current subset.
            # That is why values were removed from uniqueValsFull above: the values of this feature that do not occur
            # in the current subset are exactly the ones still left in it, and they are labelled by majority vote.
            myTree[bestFeatLabel][value] = majorityCnt(classList)
    return myTree
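Since createTree() returns nested dictionaries, the result can be inspected directly before plotting. As a hedged illustration, reusing the hypothetical toy rows from the earlier sketch (not the real bank-loan data):

# Illustrative only: build a small tree from the toy rows defined earlier.
toy_labels = ['weather', 'humidity']           # hypothetical feature names
toy_tree = createTree(toy[:], toy_labels[:], toy[:], toy_labels[:])
print(toy_tree)   # e.g. {'weather': {'cloudy': 'yes', 'sunny': {'humidity': {'high': 'no', 'low': 'yes'}}}}  (key order may differ)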
3. Calling the tree generation
# Statements that call the tree generation
df = pd.read_excel(r'E:\BaiduNetdiskDownload\spspss\data\experimental data\bank loans.xlsx')
data = df.values[:, 1:].tolist()
data_full = data[:]
labels = df.columns.values[1:-1].tolist()
labels_full = labels[:]
myTree = createTree(data, labels, data_full, labels_full)
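The slicing above assumes a particular spreadsheet layout, so it is worth checking before trusting the tree. A small hedged sketch (the column roles are assumptions about the file, which is not included with the article):

# Assumed layout: first column is an ID (dropped), middle columns are features, last column is the class label.
print(df.shape)     # number of rows and columns read from the Excel file
print(df.columns)   # verify that column 0 is the ID and the last column is the label
print(data[0])      # first sample: feature values followed by its label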
View the data
data
labels
4. Plotting the decision tree
# Functions for plotting the decision tree
import matplotlib.pyplot as plt

decisionNode = dict(boxstyle="sawtooth", fc="0.8")   # style of branch (decision) nodes
leafNode = dict(boxstyle="round4", fc="0.8")         # style of leaf nodes
arrow_args = dict(arrowstyle="<-")                   # style of the arrows

# Count the number of leaf nodes of the tree.
def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs

# Compute the maximum depth of the tree.
def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth

# Draw a node.
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)

# Write the text on an arrow.
def plotMidText(cntrPt, parentPt, txtString):
    lens = len(txtString)
    xMid = (parentPt[0] + cntrPt[0]) / 2.0 - lens * 0.002
    yMid = (parentPt[1] + cntrPt[1]) / 2.0
    createPlot.ax1.text(xMid, yMid, txtString)

def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.x0ff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.y0ff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.y0ff = plotTree.y0ff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.x0ff = plotTree.x0ff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.x0ff, plotTree.y0ff), cntrPt, leafNode)
            plotMidText((plotTree.x0ff, plotTree.y0ff), cntrPt, str(key))
    plotTree.y0ff = plotTree.y0ff + 1.0 / plotTree.totalD

def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.x0ff = -0.5 / plotTree.totalW
    plotTree.y0ff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()
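getNumLeafs() and getTreeDepth() can be checked without drawing anything, using a small hand-written tree dictionary. A minimal sketch with made-up keys:

# Hand-built two-level tree (keys are invented for illustration).
demo_tree = {'texture': {'clear': 'good', 'fuzzy': {'density<=0.38': {1: 'good', 0: 'bad'}}}}
print(getNumLeafs(demo_tree))    # 3 leaves: 'good', 'good', 'bad'
print(getTreeDepth(demo_tree))   # maximum depth: 2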
5. Calling the plotting function
# This call plots the decision tree.
createPlot(myTree)
myTree
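The printed dictionary is the whole model: every inner key is a splitting feature (a continuous feature appears renamed as "feature<=threshold", with branch values 1 and 0 after binarisation), and every leaf is a class label. For deep trees the nested structure is easier to read when pretty-printed; a small optional sketch:

# Optional: pretty-print the nested dictionary so deep branches are easier to read.
from pprint import pprint
pprint(myTree)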
Summary
This concludes this article on using a Python3 ID3 decision tree to determine whether a loan application succeeds. For more on Python ID3 decision trees, please search my previous articles or continue browsing the related articles below. I hope you will support me in the future!