From ea2a57cab95db313b26ee439d5f6da8f958fdaf9 Mon Sep 17 00:00:00 2001
From: Pedro Folloni Pesserl <fpesserl7@gmail.com>
Date: Sun, 26 Feb 2023 02:10:04 -0300
Subject: [PATCH] almost works

---
Review notes (placed after the three-dash line, so they are not part of
the commit message):

  * Fix: the sed command "i \" consumed the following "bx;" line as its
    text, so an article without a MeshHeadingList got a literal "bx;"
    line instead of the empty line that awk (RS="") needs to end the
    record. The empty line is now printed with "x; s/.*//; p; x;".
  * The redirect target is quoted ("$CSV"). Comments on added lines are
    in English and the sed script gained a few explanatory comments, so
    the hunk and diffstat counts were updated (46 insertions).
  * Removed and context lines keep their original wording because they
    must match the pre-image. Their indentation and the position of the
    blank context lines were reconstructed from a whitespace-collapsed
    copy of this patch: check them against blob fb3d7a1 before applying.
    The post-image id on the index line still refers to the original
    result (eb3323f), not to the amended one.

 tshell/tshell_p3.sh | 104 +++++++++++++++++++------------------------------
 1 file changed, 46 insertions(+), 58 deletions(-)

diff --git a/tshell/tshell_p3.sh b/tshell/tshell_p3.sh
index fb3d7a1..eb3323f 100755
--- a/tshell/tshell_p3.sh
+++ b/tshell/tshell_p3.sh
@@ -37,66 +37,54 @@ CSV=$(realpath $CSV)
 
 # converter o arquivo xml.gz para o formato csv
 zcat $XML | \
-    xgrep -tx "//PMID|//ArticleTitle|//Abstract|//MeshHeadingList" | \
-    # o sed acha os campos úteis -- PMID seguido de ArticleTitle seguido
-    # de Abstract, seguido ou não de MeshHeadingList -- e converte no
-    # formato csv. No caso de não haver MeshHeadingList para um dado artigo,
-    # cria um campo vazio (<).
-    sed -En '
-    /PMID/{
-        s/<PMID Version="//;
-        s/">//;
-        s/<\/PMID>//;
-        N;
+xgrep -tx "//PMID|//ArticleTitle|//Abstract|//MeshHeadingList" | \
+# sed picks out the useful fields -- a PMID followed by an ArticleTitle
+# followed by an Abstract, optionally followed by a MeshHeadingList --
+# and converts them to "<"-separated CSV fields. When an article has no
+# MeshHeadingList, an empty line is emitted in its place.
+sed -En '
+# Label x: lines that are not a PMID fall through and are dropped (-n).
+:x; /<PMID/{
+    s/<PMID Version="//;
+    s/">//;
+    s/<\/PMID>/</;
+    # Start a new record in the hold space, then fetch the next line.
+    x;
+    n;
 
-        /ArticleTitle/{
-            s/<ArticleTitle>/</;
-            s/<\/ArticleTitle>//;
-            N;
+    /<ArticleTitle>/{
+        s/<ArticleTitle>//;
+        s/ *<\/ArticleTitle>/</;
+        H;
+        n;
 
-            /Abstract/{
-                s/<Abstract> *<AbstractText>/</;
-                s/<\/AbstractText> *<\/Abstract>//;
-                p;
-                n;
-
-                /MeshHeadingList/{
-                    s/ *<MeshHeading> *<Descriptor[^>]*>//g;
-                    s/(<\/DescriptorName>|<\/QualifierName>) *(<Qualifier[^>]*>|<\/MeshHeading>)/, /g;
-                    s/<MeshHeadingList>/</;
-                    s/(, *|)<\/MeshHeadingList>//;
-                    p;
-                };
-
-                /PMID/{
-                    i <
-                }
-            }
-        }
-    }' >> $CSV
+        /<Abstract>/{
+            s/(<Abstract>|<AbstractText[^>]*>) *//g;
+            s/ *<\/AbstractText> *(|<CopyrightInformation>)/, /g;
+            s/(,|<\/CopyrightInformation>) *<\/Abstract>/</g;
+            s/(<sup>|<sub>)//g;
+            s/(<\/sup>|<\/sub>)/, /g;
+            # Append the abstract, print the three-line record, read on.
+            H;
+            x;
+            p;
+            n;
 
-# converter para o formato csv
-# sed -i '
-#     /PMID/{
-#         s/<PMID Version="//;
-#         s/">//;
-#         s/<\/PMID>//;
-#     }
-
-#     /ArticleTitle/{
-#         s/<ArticleTitle>/</;
-#         s/<\/ArticleTitle>//;
-#     }
-
-#     /Abstract/{
-#         s/<Abstract> *<AbstractText>/</;
-#         s/<\/AbstractText> *<\/Abstract>//;
-#     }
+            /<MeshHeadingList>/{
+                s/(<\/DescriptorName>|<\/QualifierName>) *(<Qualifier[^>]*>|<\/MeshHeading> *)/, /g;
+                s/(<MeshHeadingList> *|<MeshHeading> *<Descriptor[^>]*>|(, *|)<\/MeshHeadingList>)//g;
+                p;
+                d;
+            };
 
-#     /MeshHeadingList/{
-#         s/ *<MeshHeading> *<Descriptor[^>]*>//g;
-#         s/\(<\/DescriptorName>\|<\/QualifierName>\) *\(<Qualifier[^>]*>\|<\/MeshHeading>\)/, /g;
-#         s/<MeshHeadingList>/</;
-#         s/\(, *\|\)<\/MeshHeadingList>//;
-#     }' $CSV
+            # No MeshHeadingList: print an empty line, keep the line just read.
+            x; s/.*//; p; x;
+            bx;
+        };
+        bx;
+    };
+    bx;
+}' | \
+# awk in paragraph mode (RS="") joins each line ending in "<" with the next.
+awk 'BEGIN {RS=""} {gsub(/<\n/, "<", $0); print $0}' >> "$CSV"
 
-- 
GitLab