scikit-learn入門＆使い方 ― 機械学習の流れを学ぼう：機械学習入門 - ＠IT

scikit-learn入門＆使い方 ― 機械学習の流れを学ぼう：機械学習入門

um[wׂvbg[ɂ@BwKAڂ̑2BHŖ𗧂APythonCů{IȎgpƂāAf[^̓ǂݍ݂ƉHipandasgpjAlvZiNumPygpjƃf[^ŽiMatplotlib^seaborngpjA@BwKiscikit-learn̎gj܂ł̌ȂwڂB

» 2024N0819 0500 J
[FFCfW^Ahoe[W]
u@BwKṽCfbNX

Aږڎ

XV܂iJF2024N411AXVF2024N819j

2024N819ŐVColab‹ŁAL̑SẴR[hɓ삷邱Ƃ؂܂B

@ÓA@BwK̊bƁAvPythonCůTv܂B

@́APythong@BwKvO~O̊{IȗAۂɃR[hȂ̌IɊwł܂傤B̓Iɂ́Af[^̓ǂݍ݂ƉHAOtɂŽAvIȐlvZAĊȒPȋ@BwKf̍\z܂ŁA{IȈǍ̗ł܂i}1jB

}1@{e̖ړIFPythonɂ@BwK̊{Iȗwڂ }1@{e̖ړIFPythonɂ@BwK̊{Iȗwڂ

Ŋwׂ邱

@}1̒ʂA@BwKvO~O̊{IȗɉĐi߂ƁA1ŏЉvPythonCuipandasANumPyAMatplotlibAseabornAscikit-learnȂǁjeʂŎg邱ƂɂȂ܂B

@eCu[ĎgȂ߂ɂ́A•ʂɏڂwԂƂKvłB{Aڂł́Aڍׂɂ͐GꂸAHŖ𗧂Š{IȎgpɍiĐ܂BƐ[@艺ĊwтĺAwPythonf[^xA𕹓ǂ邱Ƃ߂܂B

@܂Ƃ߂ƁA͐}2ɎewԂƂł܂B

}2@{LŊwׂ邱 }2@{LŊwׂ邱

@ł́A܂͍gpf[^̏Љn߂Ă܂B

AځF

wPythonŊwԁu@BwKvx

PythonŊwԁu@BwKv

@u@BwK͓vƎvĂ܂񂩁H@Sz͗v܂B̘Aڂł́Aum[wׂvbg[ɁA@BwK̊bƊe@}ƊȌȐŕ₷܂BPythongHK܂̂ŁA̎𓮂ƂŎpIȃXLgɕt܂B

@񂩂́A̓Iȋ@BwK̎@iF`AA؁Ak-meansȂǁjĂ܂Bȍ~̐VLȂ悤ɁAЈȉ̃[ʒm̓o^肢܂B


0. 使用するデータセット

@́A߁iIrisjƂԂf[^ZbgiDatasetFf[^̏W܂jg܂izzFhttps://doi.org/10.24432/C56C76ACZXFCC BY 4.0jB


jRjRΊ̃}iu

@Irisf[^ZbǵA@BwK̊{IȗwԏŗzIȓĂ܂Bϐ4‚ƃVvŁAf[^150ƏȂȂKv\łBŜeՂɔcłK͂Ȃ̂AS҂Ƀsb^Ȃ̂ō̗p܂B

@@BwK̏SҌ`[gAł悭gĂ̂Łu܂ccvƎv邩܂񂪁A@BwK̊{Iȗ̌ł悤ɏAWĂ܂̂ŁAVȋCŎgł炦ƂꂵłB


@Irisf[^Zbg̐ϐij́A

  • Sepal LengthF Ђ̒icmj
  • Sepal WidthF Ђ̕icmj
  • Petal LengthF Ԃт̒icmj
  • Petal WidthF Ԃт̕icmj

4ڂƂȂĂ܂BȂAЂԂтuԁi͂ȁjv\vfŁAԂی삷uӕЁiւjv͉Ԃ̈ԊOɂAuԕفi͂ȂтAׂjv͂Ђɂ܂i}1jB߂́AЂɔłˁB

}3@Irisf[^Zbg̓e }3@Irisf[^Zbg̓e
Irisf[^Zbg̏ڍׂ͂̋LQƂĂB

@Irisf[^Zbg̖ړIϐi^[QbgAxj́A߂̎ށiNXFClassjłB̓Iɂ́A

  • setosaF Zg[Ti{FqIEMAj
  • versicolorF @[VJ[i{Fu[tbOj
  • virginicaF @[WjJi{F@[WjJj

3ނ܂Bꂼ50‚ŁAv150łB

@̋@BwKł́Af[^i4ڂ̓ʁjɁA߂̎ނ\邱ƂڕWƂȂ܂B‚܂肱́Aޖ^XNłB

@ł́Ãf[^ZbgPythonœǂݍł݂܂傤B

m[gubN̗pɂ‚

@{Aڂ́A1Ő悤ɁÃNEh‹uGoogle Colabv̗pOƂĂ܂B{Iɂ́AColabŐVKm[gubNAȍ~ŐR[h͂Ȃsʂ̖ڂŊm߂ĂBɓ͍ς݂̃m[gubNgꍇ́ÃTvm[gubNpB

1. サンプルデータを読み込んでみよう（pandas使用）

@PythonCSVt@CExcelt@C̃f[^ǂݍނȂApandasƂCuƂĂ֗łiQlLjB֐ĂяoꔭŃf[^ǂݍ߂ĕ֗Ȃ̂ŁA@BwK̎Hł悭gĂ܂BȂA@BwKɎgePythonCu1Ő̂ŁA͑SĐ܂B

@pandasɂ́ACSVt@Cpread_csv()֐AExcelt@Cpread_excel()֐pӂĂ܂Bt@CVXẽpXAC^[lbgURLA֐̑1Ɏ󂯎܂BāAURLw肵ăC^[lbgɂIrisf[^Zbgǂݍނɂ́AXg1̂悤ɏĂił́AqOۂɎĂ݂邽߂ɁAf[^ς̂M҂GitHub|WgŔzzĂ܂jB

import pandas as pd

# Load the data: read_csv() accepts a URL as its first argument, so the
# prepared Iris CSV is read straight from the GitHub repository
url = 'https://raw.githubusercontent.com/isshiki/machine-learning-with-python/main/02-scikit-learn/iris_processed.csv'
df = pd.read_csv(url)

# Check the data (a notebook displays the value of the last expression)
df.head()

Xg1@pandasŃf[^ǂݍ
m[gubNł́AeR[hZ̍ŌɋLqꂽu֐ϐȂǁv̏o͂Iɕ\idisplayjdlłB̓𗘗pāA{eł͑SčŌprint()֐ȗĂ܂BA.pyt@CȂǂŃXNvgsďo͂ꍇ́Aprint(df.head())̂悤ɖIɏo͂wĂB

@pandasC|[gƂ̕ʖ́AʏApdƂ܂Bpd.read_csv(url)œǂݍ񂾃f[^́ADataFrameif[^t[jƌĂ΂2i\`jf[^̃IuWFNgƂĕϐdfɊ蓖Ă܂B

@ɓǂݍ߂ǂmF邽߂ɁAdfIuWFNghead()\bhĂяoĂ܂BɂAǂݍ񂾃f[^̐擪5s\܂i}4jB

}4@ǂݍ񂾃f[^̓e\ }4@ǂݍ񂾃f[^̓e\

@Sso͂ƕ\ɂȂ̂ŁÂ悤ɐ擪5s\Ă܂B@BwKŕpɂɎgeNjbN̈‚łB


ł}iu

@̃f[^150ƏʂłA̋@BwKvWFNgł͂΂΋ȃf[^Zbg܂BpandasőK̓f[^ZbgǂݍƂƁARs[^̕iRAMj̕sɂ胁G[邱Ƃ܂B

@̖‚̕@́AuDaskvƂCupandas̑Ɏgp邱ƂłBDaskpandasƌ݊AK̓f[^̏ɓKĂ܂B

@AS҂̒iKłDask̊wK͕K{ł͂܂Bȃf[^ZbgKvoĂƂɁADaskpandas̃ǗɊւΏ@wԂƂ߂܂iΏ@̎QlLjB


ǂݍ񂾃f[^́Â܂܎gȂHI@O̕Kv

@f[^pӂłA@BwKn߂悤ccƂƂɂ́AʏAȂ܂Bf[^mFɎĝ͊댯łBɎ͂ꂽf[^̏ꍇAꕔ̒lē͂ꂽAĂ肷邱Ƃ悭܂B

@Ⴆ0.0011.0ƌ͂Ȃǂُli͑̒lƑ傫ႤOlj܂܂ĂA\f[^̈ꕔ󗓁A‚܂l܂܂Ă肷邱Ƃ܂BႦΑOf̐}4ɂ\f[^3s2ڂ̃ZɁuNaNviNot a NumberF񐔁jƕ\Ă܂ÁulvӖ܂B

@܂A}4́mClassnɂ́usetosavƂ񂪕\Ă܂Bscikit-learn͐l݂̂̂ŁAJeS[lij́AOɐliPythonint^float^jɒuĂKv܂B

@ُl⌇l̏AJeS[l̐lւ̒uȂǁAOɃf[^f[^͂@BwKɓK`ɐƂOi܂FPreprocessingjƌĂ΂܂BȉɁA\IȑO̍Ƃ܂Ƃ߂Ă܂B

  • f[^N[jOF f[^̕iコ邽߂ɁAlُlAOlȂǁB
  • f[^̕ϊF @BwKfł`ɁAJeS[l𐔒lf[^ɕϊȂǁB
  • K^WF @BwKf₷悤ɁAʁiϐj̃XP[iPʁj𓝈ꂷ邱ƁBqB

@@BwK̐Eł́ȂOiƌq̃RɏʃGWjAOjɑ唼̎ԁiɂ8j₷ƌĂ܂BA̎Ԃɂ܂ɒJɍsƂŁA@BwKf̐\啝Ɍシ”\܂B

@ł́Aقǂ̐ƏԂOサ܂AuJeS[l̐lւ̒uvul̏vuُl̏v̏őOĂ݂܂傤B

yRzʃGWjAO

@OƈꕔdƂɁAʃGWjAO܂BO̓f[^ꂢɐ邱Ƃɏœ_𓖂ĂĂ̂ɑ΂AʃGWjAOiFeature Engineeringj́u@BwKf̐\コv߂ɐVʁiϐjoƂɏœ_𓖂ĂĂ܂B̓Iɂ́AɈȉ̍Ƃs܂B

  • ʂ̍쐬F ̃f[^VʂoBႦ΁mtnmjnoB܂AmNnƁmnƂ̓ʂgݍ킹āmw̓XRAnƂVʂ쐬ȂǁB
  • ʂ̑IF @BwKf̐\ɑ傫^ʂ𒲂ׂđIB܂A֌W܂͗ގ̓ʂ폜ȂǁB

@ʃGWjAOsɂ́A܂f[^[Kv܂B̂߂ɁA܂܂PythonCugăf[^𕪐͂Ž肷킯łB

@܂Af[^𐶂ޕɂmihCmƌĂ΂܂jKvłBႦΖ싅f[^Ȃuqbgvuz[vȂǂ̒m͕sŒłB̒mɂāAf[^Ӗ̂ʂ‚o邩łB


2. 前処理：カテゴリー値を数値に置き換えよう（pandas使用）

@ł́ADataFrameipdIuWFNgj́mClassnɊ܂܂镶̃JeS[lAint^̐lɒuĂ݂܂BR[h́AXg2̂悤ɏĂB

# Map the category values (strings) to integers and store them in a new
# 'Class_ID' column; the original 'Class' column is left in place
class_mapping = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
df['Class_ID'] = df['Class'].map(class_mapping)

# Check the data
df.head()

Xg2@pandasŃJeS[l𐔒lɕϊ

@̃R[hɂAsetosa^versicolor^virginicáAꂼ0^1^2ƂlɒuAVȁmClass_IDnDataFrameɒlj܂BϊO̒lQƂł悤ɁÁmClassncĂ܂B}5͎sʂłB

@@BwKł́AʏAJeS[l0i1jn܂AԂ̐lɒu肵܂i̎@xGR[fBOƌĂ΂܂jBӖƒJeS[iႦ΁u5iK̖xvȂǁj̏ꍇ́ȀɉĐl蓖Ă܂i̎@GR[fBOƌĂ΂܂jBȂ݂ɁȂɂAzbgGR[fBOȂǂ̎@܂A{eł͐܂B

@JeS[lAԂ̐lɌIɕϊɂ́Amap()\bh֗łB́ApandasSeriesiV[YA̗łdf['Class']Ŏ擾mClassn\1f[^̃IuWFNgjɊ܂܂郁\bhŁAȗgݍ킹i}bsOjdictIuWFNgi̗łclass_mappingjŎ󂯎āAɏ]ϊĂ܂B

}5@JeS[l𐔒lɕϊf[^̓e\ }5@JeS[l𐔒lɕϊf[^̓e\


j}Ί̃}iu

@JeS[l܂ޗ̃f[^́AJeS[ϐJeSJf[^ƂĂ΂܂B


3. 前処理：欠損値がないか確認し、あれば対処しよう

l̗L𒲂ׂipandasgpj

@pandas DataFrame̊eɁulv邩ǂmFĂ݂܂傤BR[h̓Xg3̒ʂłB

# Count the missing values in each column: isna() marks missing cells as
# True, and sum() adds them up per column (True counts as 1)
df.isna().sum()

Xg3@pandasŌl𒲂ׂď

@isna()\bh́ANaNȂǂ̌lzvfTrueAȊO̔zvfɂFalseݒ肵AVDataFrameԂ܂B

@sum()\bh́ADataFrame̗PʂōvlԂ܂B

@}6sʂłBbooll𐔒lɂƁATrue1ŁAFalse0Ȃ̂ŁAeuvlv͂̂܂܁ul̐vӖ邱ƂɂȂ܂B

}6@ě̕\ }6@ě̕\

@mSepal WidthnɁA1̌l邱ƂmFł܂BOf̐}4ɂNaNłˁB

lipandasgpj

@lɑΏɂ́AɈȉ̕@܂B

  • l̂s̍폜F łʓIȑΏ@Bdropna()\bhgpłBf[^\ɂꍇɗLAf[^㏞B
  • lŕplȂǂŕ⊮F Al̏ꍇ͒lAJeS[l̏ꍇ͍ŕplȂǂ̓vʂŁAl𖄂߂Bfillna()\bhgpłByAf[^̕zci䂪j߂ċ@BwǨʂɉe^”\B
  • ʃASYŕ⊮F 炩̃ASYgpăf[^̊m̕琄āAl𖄂߂BႦkߖT@ik-NNA̘AڂŐ\jASYɂ⊮iimputej́Ascikit-learnKNNImputerNXgpłBvZRXg͍܂邪Af[^̓萳mɔfł”\B

@̗ł͌l1‚ȂAu1s폜Ă@BwǨʂɑ債e͂Ȃvƍl̂ŁAul̂s̍폜vsƂɂ܂iXg4jB

# Drop the rows that contain missing values
df_dropped = df.dropna()

# Print the row counts before and after dropping.
# NOTE(review): the two labels below were mis-encoded in the source and have
# been reconstructed from the surviving bytes — confirm against the original.
print('元データの行数： ', len(df))
print('欠損値処理済みデータの行数： ', len(df_dropped))

Xg4@pandasŌl̂s폜

@dropna()\bh̎gɂ‚ĕ⑫ƁAftHgł͌l܂ލs폜܂Aaxis=1w肷Ɨ폜܂B̗ɑ̌l܂܂ĂꍇiႦ΁Ã̗f[^̔ȏオĂȂǂ̏ꍇjA̗̍폜ĂB

@Pythonlen()֐̈DataFramei̗łdf_droppedȂǁjnƁA̍s܂B}7͂̎sʂŁAm1sĂ邱ƂmFł܂B

}7@lʂ̕\ }7@lʂ̕\


ł}iu

@Xg4df.dropna()Ƃ͈ɂ܂BႦ΁Adf[~df.isna().any(axis=1)]ƏƂł܂iR[he̐͊܂jBȍ~̃R[hA܂ŏ̈ɂȂ̂łӂB


4. 前処理：異常値がないか確認し、あれば対処しよう

OtُloiMatplotlibgpj

@ɁApandas DataFrame̊eɁuُlviuOlvj邩ǂmFĂ݂܂傤BُĺAf[^OtƂĉŽƈڂ傤RłB

@ُľoɖ𗧂ƒOtƂẮAႦΈȉ̂̂܂B}̏ڍׂ́ANȂǂmFĂB

  • Ђ}F f[^̕zlʔ͈́i25`75j̔ŕ\Al^ŏl^ől̑AOlȊۂŕ\BُliOljoIɊmF₷iQlLjB
  • Uz}F f[^2ϐi2‚̓ʁjԂ̊֌W_ŃvbgijBُĺÃf[^|Cg痣ꂽ_ƂĖڗ‚̂Ŕc₷iQlLjB
  • qXgOF f[^̕zאڂ_Otŕ\BُĺAz̊Oꂽɂ鏬Ȗ_ƂČołiQlLjB
  • nvbgi܂OtȂǁjF Ot̉ɎԎĒlvbgBԂ̌o߂ƂƂɃf[^ǂ̂悤ɕωĂ邩mFłBُĺÃf[^|Cgl܂͒ႢlƂĕ\iQlLjB

@͍ł{IȔЂ}`悵Ă݂܂傤iXg5jB

import matplotlib.pyplot as plt

# Select the four feature columns (all rows) from the DataFrame whose
# missing-value rows were dropped
df_features = df_dropped.loc[:, ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']]

# Draw a box plot of the features and display it
df_features.boxplot()
plt.show()

Xg5@Matplotlibňُlo邽߂̔Ђ}`悷

@PythonŊ{IȃOt`悷ɂ́ACuuMatplotlibvg܂B̃Ot`惂W[łmatplotlib.pyplotC|[gƂ̕ʖ́AʏApltƂ܂B

@ł́A4‚̓ʂɑ΂ĔЂ}쐬邱Ƃɂ܂BŁADataFrameʂ𒊏o܂Bɂ́Asf[^̈ꕔ𒊏ołloc\bhg܂B

　リスト5では、locメソッドの引数に対して:で全行が、["Sepal Length", "Sepal Width", "Petal Length", "Petal Width"]で4つの特徴量が指定されています。df_features変数にはこの4つの特徴量を含む新しいDataFrameが割り当てられます。

@DataFrameboxplot()\bhĂяoƁAMatplotlibgЂ}쐬܂B

@쐬ꂽOtm[gubNɕ\ɂ́Aplt.show()\bhĂяo܂B}8͂̎sʂłB

}8@ُl𔭌邽߂ɔЂ}\ }8@ُl𔭌邽߂ɔЂ}\

@mSepal Widthnɂ͏Ȋۂ‚\ĂAiЂ}̊ŁjOl悤łAɒ[ɗꂽꏊɂ͕\ĂȂ̂ŋe͈͂ł傤Bُ͈l͂Ȃ̂ƂāÂ܂ܑSẴf[^g悤ɂ܂B

@ُlꍇ́Aقǂ̌lƓ悤ɍ폜邩⊮ȂǁA󋵂ɉĔf܂傤B{Iɂ́Aُl̂s폜̂߂łBOluُvł͂ȂԂ𔽉fĂƍlꍇ́Â܂ܗp̂K؂łB

vʂُlmFipandasgpj

@OtłȂAvʂ̐lłdɊmFĂƈSłBXg6̃R[hłB

# Show the basic summary statistics of the selected features
df_features.describe()

Xg6@pandasœvʂ𒲂ׂ

@4‚̓ʂ܂DataFramedescribe()\bhĂяoƁA}9̂悤ɓvʂ܂Ƃ߂ĕ\܂B֗łˁB

}9@{Iȓvʂ܂Ƃ߂ĕ\ }9@{Iȓvʂ܂Ƃ߂ĕ\
count̓f[^ŁA149ƂȂĂ܂BȊO̍ږ́Aȉ̖{ɓoꂵ܂B

@vʂ邱ƂŁAf[^̑S̑cAُl݂̑邱Ƃł܂BɁAŏliminj1lʐi25jA͍őlimaxj3lʐi75j傫Ăꍇ́Aُl̉”\ƍl܂B

@ɂ́Aϒlimeanjƒli50j傫ijĂAW΍istdjُɑ傫A‚܂f[^Lo‚Ă肷ꍇ́Af[^̕΂ُl݂̑Ô\܂B

ypeNjbNzꂽŽƐlvZ

荂xȃOt`iseaborngpj

@قǂُ͈ľoɖ𗧂ƒOtƂĔЂ}Uz}ȂǂЉ܂AMatplotlibō쐬ł܂BseabornCup邱ƂŁA葽lȃOt쐬ł悤ɂȂ܂B̈Ⴊur[XEH[}vłB

  • r[XEH[}iBee Swarm PlotFIQ}jF Uz}ƔЂ}̓gݍ킹̂ŁAf[^|Cgd˂ɕ\Af[^̕z薾mɎBُĺÃf[^|Cg痣ꂽ_ƂĖڗ‚̂Ŕc₷BAK͂ȃf[^TCYŁAK̓f[^Zbgɂ͌ȂB

@ł̓r[XEH[}쐬Ă݂܂傤B܂̓Xg7̃R[hsāAseabornCuCXg[܂B

# Install the seaborn library (in a notebook code cell, a line starting with "!" is run as an OS shell command)
! pip install seaborn

Xg7@seabornCuCXg[
m[gubÑR[hZł́A!Ŏn܂sOS̃VFR}hƂĎs܂B
PythonfBXgr[VuAnacondavcondagĂꍇ́Aconda install seabornR}h^[~iŎsĂB

@ɁAr[XEH[}`悵܂iXg8jBseabornMatplotlibx[X̃CuȂ̂ŁA͂matplotlib.pyplotW[̃C|[gi{IɁjKvłB

# Import the seaborn library (matplotlib.pyplot is still needed for plt.show())
import seaborn as sns
import matplotlib.pyplot as plt

# Draw a bee swarm plot with seaborn: pd.melt() reshapes the wide-form
# DataFrame into long form, whose default column names are "variable" and
# "value"; size=2.5 sets the marker size
sns.swarmplot(data=pd.melt(df_features), x="variable", y="value", size=2.5)
plt.show()

Xg8@seabornňُlo邽߂̃r[XEH[}`悷

@CuuseabornvseabornW[C|[gƂ̕ʖ́AʏAsnsƂ܂BȂ݂Ɂusnsv́Aerh}̓oluSamuel Norman SeabornvɗR邻łB

@r[XEH[}쐬ɂ́Asns.swarmplot()\bhĂяo܂Bdatapd.melt(df_features)w肵Ă܂Aswarmplot()\bhuO`v̕\`f[^iTabular dataFsƗ̌`Őꂽf[^jOƂĂ邽߂łB

@O`iLong-formjƂ͕\`f[^ucv`ŁA܂Ch`iWide-formjƂ́uv`Ő@łiQlLjBႦIrisf[^ZbǵAes1‚̃f[^|CgŁAe񂪓ʁiЂ̒AЂ̕AԂт̒AԂт̕jɂȂĂ郏Ch`̕\`f[^łB

@O`ɕϊꍇAeʂ•ʂ4‚̗񂩂uʂ̎ށvƂ1‚̗ɂ܂Ƃ߁Aulv1‚̗ŕ\܂i}10jB

}10@Irisf[^ZbgCh`烍O`ɕϊ }10@Irisf[^ZbgCh`烍O`ɕϊ
0A1AccƂԍ́ADataFrame̊esӂɎʂ邽߂CfbNXiIndexjłB

@̂悤ȃCh烍Oւ̌`ΐApd.melt()\bhĂяołł܂BftHgŁuʂ̎ށvvariableAulvvalueƂOɂȂ܂Bsns.swarmplot()\bhx="variable", y="value"Ƃ̖͂Ow肵Ă킯łB

@size=2.5́Avbg_if[^|Cgj̑傫w肵Ă܂Br[XEH[}́A_d˂ɕ\dlłBf[^Ɠ_dȂA\bhĂяoɌx\܂B邽߂ɁA_̂Ă܂B

@Ƃ͐قǂƓlplt.show()\bhŁAm[gubNɕ\邾łi}11jB

}11@ُl𔭌邽߂Ƀr[XEH[}\ }11@ُl𔭌邽߂Ƀr[XEH[}\

@̃r[XEH[}ɂ́A傫ꂽf[^|Cg͂ȂAُl݂Ȃ̂mFł܂B

yRzTIf[^͂ŗLpseabornpairplot

@seabornpairplot֐LpȂ̂ŏЉ܂Bg͊ȒPȂ̂Ősvł傤iXg9jBȂ݂ɁApandasɂl̂Ƃłscatter_matrix()֐݂܂B

# Plot every pair of features in one grid of charts and display it
sns.pairplot(df_features)
plt.show()

Xg9@seabornŊeʂ̃yAɑ΂Uz}\

@̊֐́ADataFrame̊eʂŃyAđ̃Otxɍ쐬Ă܂B4‚̓ʂꍇA16‚̐}i4~4̃}gbNXj쐬܂i}12jB

}12@ʊԂ̊֌WUz}ŕ\ }12@ʊԂ̊֌WUz}ŕ\

@ʊԂ̊֌Weʂ̕zڂŊmFł܂BΊp̃ZiF1s1ڂ́mSepal Lengthnm̃yAjł́ueʂ̕zvqXgOŁȂ̃ZiF1s2ڂ́mSepal LengthnƁmSepal WidthñyAjł́u2‚̓ʊԂ̊֌WvUz}ŕ`悳܂B

@̂悤pairplot֐𗘗pƁAʊԂ̑ւAž`Aُl̗LȂǁAf[^Zbg̊Tvfcł܂B

@@BwKvWFNgł́AiOɂĕiサjf[^Zbg[邱ƂAʂ̑InCp[p[^[iPOɐlԂɂĎw肷ݒ荀ځj̃`[jOiĵ߂ɔɏdvłB̍ƂTIf[^iEDAFExploratory Data AnalysisjƌĂ΂܂ȀiKł̊֐𗧂܂B


荂ȐlvZiNumPygpj

@Of̃Xg6łpandasgēvʂ߂܂BAɑʂ̃f[^ŐlvZz񑀍ۂɂ́ApandasNumPy̕ǂꍇ̂ŁANumPy߂łB

@lvZCuuNumPyvndarrayƌĂ΂zf[^́ApandasDataFrameɔׂāAANZX␔lvZ荂łB́AndarrayȋSĂ̒ljlȃf[^^ō\A჌x̍œK{Ă邽߂łB


rbN}iu

@ɕM҂ColabőxrƂA2f[^iNumPy ndarray vs. pandas DataFramejł́ANumPypandasuzvfւ̃ANZXv͖200{Auς̌vZv͖30{ʂA܂1f[^iNumPy ndarray vs. pandas Seriesjł́ANumPypandasuzvfւ̃ANZXv͖1000{Auς̌vZv͖8{Au|ŽvZv͖100{ʂo܂iQlFTvm[gubNŎ܂jBAʂ͎s‹f[^eɂĕς܂̂łӂB


@pandaśA܂łɐf[^̑O╪͂̏iKŕ֗łBpandasNumPy̗CúApV[œK؂ɎgƂ悢ł傤B

@ł́ApandasőOsf[^ndarrayɕϊAςvZĂ݂܂iXg10jB

import numpy as np

# Convert the pandas DataFrame to an ndarray (NumPy array) with a single dtype
features_array = df_features.to_numpy(dtype='float32')

# Compute the mean of each column with NumPy (axis=0 means per column)
mean_features = np.mean(features_array, axis=0)
mean_features  # Example output: array([5.8510065, 3.0563755, 3.7744968, 1.2060403], dtype=float32)

Xg10@NumPyōɐlvZ

@numpyW[C|[gƂ̕ʖ́AʏAnpƂ܂B

@pandasto_numpy()\bhŁADataFrameNumPyndarrayɕϊł܂BndarrayDataFrameƈقȂAlȃf[^^ɓꂷKv邽߁Adtype='float32'w肵ĖIɃf[^^𓝈ꂵĂ܂B

@ɁANumPy̐lvZ\bḧƂnp.mean()\bhĂяoĂ܂BvZʂ́Apandasg}9meanƓłB

yRzKWsɂscikit-learn֗

@@BwKł́Aeʁif[^j̃XP[iPʁj𓝈ꂷƂłKiNormalizationjsƂŁAwǨ@BwKf̐\オ҂ł܂BK̑\IȎ@̈‚WiStandardizationjłB

@ẂAeʂŕρi‚܂f[^̕z̒Sju0vɁAW΍i‚܂蕽ς̃f[^̂΂‚ju1ṽXP[ɕϊiXP[Oj@łB̌vŹAf[^畽ϒlŁAW΍Ŋ邾iF(data - mean) / stdjȂ̂ŁANumPypandasłȒPɎł܂B

@scikit-learng΂ƊȒPłBsklearn.preprocessingW[ɗpӂꂽA܂܂ȐK̂߂̃NXpł܂BWɂ́AStandardScalerNXg܂iXg11jB

from sklearn.preprocessing import StandardScaler

# Standardize all features with scikit-learn
scaler = StandardScaler()
sk_scaled = scaler.fit_transform(features_array)
sk_scaled[:2]  # show the first 2 rows
# Example output:
# array([[-0.9128386,  1.0181674, -1.353994 , -1.3275825],
#        [-1.1559356, -0.1293889, -1.353994 , -1.3275825]], dtype=float32)

Xg11@scikit-learnŕWR[h

@fit_transform()\bhɂAf[^ϊȉꍇ͕Wj܂BẂAf[^Zbg̑Sʁi̗łfeatures_arrayjɑ΂Ă܂Ƃ߂čŝʓIłB

@ӓ_ƂāAscikit-learnNumPypandasł́AW΍̌vZ@ɈقȂ”\܂Bscikit-learnł̓ftHgŕWc̕W΍gpANumPypandasł͈ddof=1w肷邱ƂsΕW΍iW{Wc̕W΍𐄒肵ljIł܂Bddof=0w肷ƁAscikit-learnƓW΍ɂȂ܂B

@̗ƂāANumPypandasł(data - data.mean(axis=0)) / data.std(axis=0, ddof=0)ƂR[hŕWł܂Baxis=0́Ai‚܂ʂƁǰvZӖ܂iQƁFNumPỹwvApandas̃wvjB_ȉ̊ۂߌ덷ȂǂɂAscikit-learnNumPypandašvZʂ́ASɈvȂƂ܂B

@̂悤Ȑl̕ϊ@ƒm肽ꍇ́A̋ĹuʃGWjAOv\mlϐn̐QlɂĂB


@܂ŁAuf[^̓ǂݍ݁vAlُl̏ƂuOvsAf[^̕iコĂ܂B܂ApIɂȂ܂Af[^ŽȂǂɂf[^Zbg[uTIf[^́vƁAʂ̐VK쐬IȂǂsuʃGWjAOvɂ‚ĂARŏЉ܂B͂A@BwKf̐\コ邽߂ɏdvȍƂłB́A悢@BwK̒iKɓ܂B

5. 機械学習の準備：データを訓練用とテスト用に分割しよう（scikit-learn使用）

@@BwKfPOɁAf[^Ppf[^ZbgiTraining setFPZbgjeXgpf[^ZbgiTest setFeXgZbgjɕ邱ƂʓIłB́AfwKɎgpĂȂum̃f[^vɑ΂Ăǂꂾ܂\ł邩AȂ킿fĉ\i͂񂩂̂j𐳊mɕ]邽߂łB

@f̌PvZX番ƗeXgZbgpӂ邱ƂŁAuE̖m̃f[^ɑ΂郂f̐^̐\viĉ\jcł悤ɂȂ܂BɂAfPf[^ߏɓKiuOver-fittingFߊwKvƂĂ΂܂jĂ܂oł悤ɂȂ܂B

@ṓAf[^90PZbgɁA10eXgZbgɊ蓖Ă܂i}13jB̊ɖmȊ͂܂񂪁A̓f[^149ƏȂ̂ŁAeXgZbg͏Ȃ߂ɂ܂BʓIɂ80F2070F30悤łBPZbg𑽂ƊwK₷ȂAeXgZbg𑽂Ɣĉ\K؂ɕ]₷Ȃƍl܂B

}13@̃f[^Zbg͌PZbg90ƃeXgZbg102 }13@̃f[^Zbg͌PZbg90ƃeXgZbg102

@scikit-learnCug΁Af[^ȒPɍs܂B̓Iɂsklearn.model_selectionW[train_test_split()֐ŁAf[^ZbgPpƃeXgpɕł܂iXg12jB

from sklearn.model_selection import train_test_split

X = df_features  # input data (X: features): the variables fed into the model
y = df_dropped['Class_ID'] # ground-truth values (y: labels): the target variable to predict

# Split the data into a training set and a test set (the test set is 10% of
# the whole); random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Join the training-set features and labels column-wise and show the first 5 rows
pd.concat([X_train, y_train], axis=1).head()

Xg12@scikit-learnŃf[^PpƃeXgpɕR[h

@̓f[^ƂȂ4‚̓ʂXƂϐɁAfɂ\ʂƔr邽߂̐lƂȂ郉xiړIϐjyƂϐɊ蓖ĂĂ܂B́AwӎŁA啶X͍sf[^i2̔zf[^łDataFramejAy̓xNgf[^i1̔zf[^łSeriesjӖ܂B

@train_test_split()֐̈ɂ́AXyɉāAtest_size=0.1w肳Ă܂B̓f[^10i0.1jeXgZbgɂ邱ƂӖ܂B

@Of̐}3Ō悤ɁAIrisf[^Zbg͐擪SāusetosavɂȂĂȂǁȀŕłAf[^΂Ă܂B̂߁Af[^̃VbtsŒłBtrain_test_split()֐́AftHgshuffle=Trueݒ肳ĂAw肵ȂĂIɃf[^Vbt܂B

@random_state=42́AV[hƌĂ΂A_ɃVbtۂ̊ƂȂ鐔lłB̗V[hlw肷邱ƂŁAʂ̍Čۏ؂܂B‚܂AV[hgp邱ƂŁA̐lR[hsۂɂAf[^̕ʂ悤ɂȂ܂B


j}Ί̃}iu

@V[hɂ42Ƃl悭g܂Bw̓qb`nCNEKChxi_OXEA_Xj̒ŁAX[p[Rs[^750Nāu^F^ɂ‚Ă̋ɂ̋^ɑ΂铚vvZʂ42łB̐ĺAȊwZp̕ł悭pAy݂Ȃʂ̍Čmۂ邽߂̈̓`ƂȂĂ܂B


@train_test_split()֐́APZbg̓ʁiX_trainjAeXgZbg̓ʁiX_testjAPZbg̃xiy_trainjAeXgZbg̃xiy_testjƂ4vf̃XgԂ܂B

pd.concat()֐ŁAPZbg̓ʂƃx1‚DataFrameɘAĂ܂Baxis=1͗ɘA邱ƂӖ܂B

ؗpf[^ZbgKvȗR

@@BwKfœKA̐\őɈo߂ɂ́AK؂ȃnCp[p[^[̃`[jOsŒłB`[jOɂ́Af[^PZbgA؃ZbgiValidation setFؗpf[^ZbgjAeXgZbg3‚ɕKv܂B

@nCp[p[^[̃`[jOɃeXgZbggpƁAeXg̃f[^umvłȂȂĂ܂܂B‚܂ĉ\K؂ɕ]łȂȂ܂BāAeXgZbg͎g킸Ɂumv̂܂܎cāA`[jOɎg؃ZbgVɕKvɂȂƂ킯łB

@@BwK̉Lł́A؃ZbgpӂĂȂꍇłA̓nCp[p[^[`[jOȂ߂ƍl܂B{AڂłA{Iɂ3ɁAPZbgƃeXgZbg2ōς܂悤ɂ܂Bۂ̃vWFNgł́A؃Zbg܂߂3܂B

@؃Zbǵ̕AeXgZbgƓ炢ɂ̂ʓIłBɏ]́APZbg80A؃ZbgƃeXgZbgꂼ10‚̊ŕ܂i}14jB

図14　今回のデータセットは訓練セット80％と検証セット10％とテストセット10％に3分割 図14　今回のデータセットは訓練セット80％と検証セット10％とテストセット10％に3分割

@́AeXgZbg10ɕς݂Ȃ̂ŁAPZbg炳Ɍ؃Zbg𕪊܂B90̂10𕪊̂train_test_split()֐̈ɂtest_size=1/9Ǝw肵܂iXg13jB1/9i1090jƕ`ɂĂ̂́A0.111...ƂlɂƈӐ}ȂƍlłB

# Split a validation set off the training set: 1/9 of the remaining 90%,
# so that the validation set is 10% of the whole data
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=1/9, random_state=42)

Xg13@scikit-learnŌPp̃f[^猟ؗp𕪊R[h

@X_valid؃Zbg̓ʂŁAy_valid؃Zbg̃xłBȏŁAf[^̏܂B

yRzz[hAEg؂ƌ؂̈Ⴂ

@ŒIɌPZbg^؃Zbg^eXgZbg3i͌PZbg^؃Zbg2jāA@BwKf̐\]@z[hAEgiHold-out ValidationjƌĂт܂B

@z[hAEg؂́Aŕ₷łÂ悤ɏȂf[^̃f[^Zbg𕪊ƁÅeZbg̃f[^ɏȂȂĂ܂肪܂B܂AuVbtĂvƂ͌ĂAŒIȃf[^ŌP]邱ƂɂȂ̂ŁA΂wK]ʂɂȂ”\ےł܂B

@ł΁uSẴf[^v]ƂȂgČPA΂肪Ȃ悤oXǂ]邱Ƃ]܂ł傤B”\ɂ̂AiCVFCross ValidationANXof[VjłB؂łAŏI]̂߂ɎOɃeXgZbg͕ĎcĂ悤ɂ܂傤B

@؂́AɃf[^ʂȂꍇɗLłAvZRXg]vɂfbg܂B_ŕ΂̂Ȃf[^ʂɂȂAz[hAEg؂ŏ\łB

@؂̈@k-fold܂B̎@ł́Af[^Zbg𓙕k‚̃tH[hifoldFjɕA1‚̃tH[h؃ZbgƂāAck|1‚̃tH[hPZbgƂĎgp܂i}15jB̃vZXkJԂAetH[hx͌؃ZbgƂėp悤ɂ܂B

}15@k-fold؂5Ƃ̌PZbgƌ؃Zbg }15@k-fold؂5Ƃ̌PZbgƌ؃Zbg

@}15k5̏ꍇŁAuf1`5vƕ\悤5‚̋@BwKfŌPƌ؁i]jłBႦ΁uÁṽfȂAˆقȂ5‚̌Pς݃f쐬܂Bef؃Zbgŕ]āA̕ς邱ƂŁAIȃf̐\]܂B

@؂́AK؂ȃnCp[p[^[lȋgݍ킹jTړIŖ𗧂܂Bscikit-learnɂ́A̖ړIŎgsklearn.model_selectionW[GridSearchCVNXpӂĂ܂iQlLjBɂ茩‚œKȃnCp[p[^[lŁASẮuPZbg{؃ZbgvgčČP1‚̋@BwKf쐬̂AʓIȎp@̈‚łB

@܂Ak-fold؂ō쐬k‚̃fSĎgAeXgZbgɑ΂k‚̗\l擾āA𕽋ς邱ƂȂǁiATuwKƌĂ΂܂j1‚̗\lƂ邱ƂAʓIȎp@̈‚łB̕@́A@BwK̋ZłKaggleRyeBVł悭̗pĂ܂Bۂ̋LŎĂ̂ŎQlɂĂB


6. 機械学習の実践：訓練～予測～評価しよう（scikit-learnの使い方）

@łA悢Ō̃XebvłB́A@BwKɋʂ闬Љ܂B•ʂ̋@BwKASY̓eɂ‚ẮAȍ~Ō•ʂɏڂĂ܂B

@@BwKvWFNgł́A@BwK̃ASYiFÁjIAPZbgpċ@BwKfPifitjAeXgZbgpČPς݃fɂ\ipredictjs܂Bscikit-learnCugāÅ{IȗȌɗ܂傤BȂÃvWFNgł́APɌ؃ZbgpănCp[p[^[̃`[jOs܂A܂ߖ{Aڂł͊{Iɏȗ܂B

@͋@BwK̃ASYƂāAf[̎dɎgꂽƂŗLiC[uxCYފiPxCYފjgp܂Bscikit-learnł́Asklearn.naive_bayesW[GaussianNBNXƂāA̋@\񋟂Ă܂B

@܂́A@BwK̃ASY@I܂iXg14jB

from sklearn.naive_bayes import GaussianNB

# Select the machine learning algorithm
model = GaussianNB(var_smoothing=1e-9)  # Gaussian naive Bayes classifier

Xg14@scikit-learnŋ@BwK̃ASYIR[h

var_smoothingp[^[́AiC[uxCYފɂ镪U𕽊邽߂̃nCp[p[^[łBɌPZbgȂꍇɋNߊwKiߏKj̖hAf̈萫コ邽߂Ɏgp܂BftHgl1e-9i00.000000001jłB

@ɁAPZbg@BwKfimodeljɓ͂āAfP܂iXg15jB

# Feed the training set to the model to train it
model.fit(X_train, y_train)

Xg15@scikit-learnŋ@BwKfPR[h

@scikit-learnł͊{IɁAPmodel.fit()\bhōs܂Bfit()\bhɓ͂ĂPZbǵApandas DataFrameX_trainSeriesy_trainłBscikit-learn̓ł́A{INumPyndarraygĂ܂Ao͂ɂpandasDataFrameȂǂT|[gĂ܂B

@ɁAeXgZbg@BwKfimodeljɓ͂āAPς݃fŗ\܂iXg16jB

# Feed the test set to the trained model to get its predictions
pred_test = model.predict(X_test)

pred_test
# Example output: array([1, 0, 2, 1, 2, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0], dtype=int64)

Xg16@scikit-learnŌPς݃fgė\R[h

@scikit-learnł͊{IɁA\model.predict()\bhōs܂Bpredict()\bhɓ͂ĂeXgZbǵApandas DataFrameX_testłB̂悤ɕ̃f[^܂Ƃ߂ė\ł܂B

@predict()\bh̖߂lƂāANumPyndarray1̔zf[^ԂĂ܂Bɂ́A\ʂeXgZbg̃f[^Ɋi[Ă܂B

@̗\ʁipred_testjƃeXgZbg̐liy_testj𓚂킹āA̐𗦁iAccuracyj]܂iXg17jB

from sklearn.metrics import accuracy_score

# Evaluate the predictions on the test set against the ground-truth labels
print(f'Accuracy:{accuracy_score(y_test, pred_test)}')
# Example output: Accuracy:0.9333333333333333

Xg17@scikit-learnŋ@BwKf̐\]R[h

@scikit-learnł́A̕]sklearn.metricsW[accuracy_score()֐ōs܂BȂA]wWɂ͐𗦈ȊOɂKF1XRAȂǑɂ܂܂Ȃ̂܂iQlLjB

@accuracy_score()֐̖߂lƂāAfloatlԂĂ܂B0.9333...́u93v̐𗦂Ӗ܂B́Aɍ𗦂ɂȂ܂B

@eXgOɂnCp[p[^[`[jOɂ́AXg14ɂnCp[p[^[var_smoothinglςāAXg15̌PsĂ݂ĂBXg16`17Ɠ@Ō؃ZbgiX_validy_validjgāA\A]܂B𗦂荂lɂȂ悤ɁA̎菇JԂƂōœKȃnCp[p[^[l‚o܂B

@Ȃ݂Ɍ؃ZbggāA\A]Ɛ𗦂100ł̂ŁAIrisf[^ZbgƃiC[uxCYފ̑gݍ킹ł́AnCp[p[^[`[jO]n͂قƂǂȂłB


@́APythonɂ@BwK̊{Iȗ܂B̓eɃR[hĂ邱ƂD܂łBMȂ΁AЉxŏ蒼Ă݂ĂB܂A]T΁A͎̎Ă݂ĂB

@HIȋ@BwK̎菇wтluwKaggleu~RyQxŋ@BwKn߂悤vƂAڋLǂ邱Ƃ߂܂B

@񂩂́A̓Iȋ@BwK̎@iF`AA؁Ak-meansȂǁjĂ܂Be@BwK̃vO~Oɋʂê͍ŁA񂩂炻Əd͊{Iɏȗ܂B

@͐`APythonŃvO~OĂ݂܂By݂ɁB

͎NCY

@IWF̕NbN܂̓^bvƓ\܂Bqg~ꍇ́AΐF̕NbNĂBߖɎgI\܂B

@@BwKł́A܂PythoñCuupandasvread_csv()֐găf[^ǂݍ݁ADataFrameƌĂ΂2i\`jf[^̃IuWFNgƂĎ擾܂B

@ǂݍ񂾃f[^ɂ́Aُl⌇l̏AJeS[l̐lւ̒uȂǁAOɃf[^f[^͂@BwKɓK`ɐOsŒłB

@ُlOĺAf[^OtƂŽƈڂ傤RłBPythonŊ{IȃOt`悷ɂ́ACuuMatplotlibvg܂B

@@BwKvWFNgł́Af[^Zbg[邱ƂdvłB̍ƂTIf[^iEDAjƌĂ΂܂B́AʂI쐬肷ʃGWjAOɂ𗧂܂B

@荂xȃOt̕`ɂ́AʊԂ̊֌Weʂ̕zɊւ鑽̃Otxɍ쐬łpairplot()֐Ȃǂ郉CuuseabornvLpłB

@ȐlvZɂNumPyndarrayƌĂ΂zf[^KĂ܂BCu͖ړIɉĎĝ߂łB

@f[^ZbǵAPɎguPZbgvƁAnCp[p[^[̃`[jOɎgu؃ZbgvƁAĉ\]邽߂ɎgueXgZbgv3܂傤B

@@BwK̊{Iȗł́A@BwK̃ASY@IAfPAPς݃f\ʂƐlׂĐ\]܂B

qgF @TableFrame@@ndimdata@@TIf[^́@@pandas@@\@@@@scikit-learn@@mFZbg@@؃Zbg@@seaborn@@Dask@@Bokeh@@f[^}CjO@@O@@ndarray@@ʃGWjAO@@DataFrame@@Ž@@Matplotlib@@ĉ\@@f[^pCvC@@t@C`[jO@

u@BwKṽCfbNX

u@BwKv

Copyright© Digital Advantage Corp. All Rights Reserved.

X|T[̂m点PR

ڂ̃e[}

Microsoft  WindowsőO2025
AI for GWjAO
[R[h^m[R[h Zg by IT - ITGWjArWlX̒SŊ􂷂gD
Cloud Native Central by IT - XP[uȔ\͂gD
VXeJmEnE yirzPR
Ȃɂ߂̋LPR

RSSɂ‚

ACeBfBAIDɂ‚

[}KWo^

IT̃[}KẂA AׂĖłBЃ[}KWwǂB